diff --git a/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Types.h b/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Types.h
index 7d5c19872d5a85..2888e252267556 100644
--- a/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Types.h
+++ b/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Types.h
@@ -136,8 +136,8 @@ struct Header {
   }
   StringRef verbatim() const { return std::get<Verbatim>(Storage); }
 
-  /// Absolute path for the header when it's a physical file. Otherwise just
-  /// the spelling without surrounding quotes/brackets.
+  /// For physical files, either absolute path or path relative to the execution
+  /// root. Otherwise just the spelling without surrounding quotes/brackets.
   llvm::StringRef resolvedPath() const;
 
 private:
diff --git a/clang-tools-extra/include-cleaner/lib/Analysis.cpp b/clang-tools-extra/include-cleaner/lib/Analysis.cpp
index 68fe79d6929f6b..05e9d14734a95f 100644
--- a/clang-tools-extra/include-cleaner/lib/Analysis.cpp
+++ b/clang-tools-extra/include-cleaner/lib/Analysis.cpp
@@ -82,7 +82,7 @@ analyze(llvm::ArrayRef<Decl *> ASTRoots,
         const PragmaIncludes *PI, const Preprocessor &PP,
         llvm::function_ref<bool(llvm::StringRef)> HeaderFilter) {
   auto &SM = PP.getSourceManager();
-  const FileEntry *MainFile = SM.getFileEntryForID(SM.getMainFileID());
+  const auto MainFile = *SM.getFileEntryRefForID(SM.getMainFileID());
   llvm::DenseSet<const Include *> Used;
   llvm::StringSet<> Missing;
   if (!HeaderFilter)
@@ -95,7 +95,7 @@ analyze(llvm::ArrayRef<Decl *> ASTRoots,
              for (const Header &H : Providers) {
                if (H.kind() == Header::Physical &&
                    (H.physical() == MainFile ||
-                    (ResourceDir && H.physical().getDir() == *ResourceDir))) {
+                    H.physical().getDir() == ResourceDir)) {
                  Satisfied = true;
                }
                for (const Include *I : Inc.match(H)) {
@@ -103,29 +103,30 @@ analyze(llvm::ArrayRef<Decl *> ASTRoots,
                  Satisfied = true;
                }
              }
-             if (!Satisfied && !Providers.empty() &&
-                 Ref.RT == RefType::Explicit &&
-                 !HeaderFilter(Providers.front().resolvedPath())) {
-               // Check if we have any headers with the same spelling, in edge
-               // cases like `#include_next "foo.h"`, the user can't ever
-               // include the physical foo.h, but can have a spelling that
-               // refers to it.
-               auto Spelling = spellHeader(
-                   {Providers.front(), PP.getHeaderSearchInfo(), MainFile});
-               for (const Include *I : Inc.match(Header{Spelling})) {
-                 Used.insert(I);
-                 Satisfied = true;
-               }
-               if (!Satisfied)
-                 Missing.insert(std::move(Spelling));
+             // Bail out if we can't (or need not) insert an include.
+             if (Satisfied || Providers.empty() || Ref.RT != RefType::Explicit)
+               return;
+             if (HeaderFilter(Providers.front().resolvedPath()))
+               return;
+             // Check if we have any headers with the same spelling, in edge
+             // cases like `#include_next "foo.h"`, the user can't ever
+             // include the physical foo.h, but can have a spelling that
+             // refers to it.
+             auto Spelling = spellHeader(
+                 {Providers.front(), PP.getHeaderSearchInfo(), MainFile});
+             for (const Include *I : Inc.match(Header{Spelling})) {
+               Used.insert(I);
+               Satisfied = true;
              }
+             if (!Satisfied)
+               Missing.insert(std::move(Spelling));
            });
 
   AnalysisResults Results;
   for (const Include &I : Inc.all()) {
     if (Used.contains(&I) || !I.Resolved ||
-        HeaderFilter(I.Resolved->getFileEntry().tryGetRealPathName()) ||
-        (ResourceDir && I.Resolved->getFileEntry().getDir() == *ResourceDir))
+        HeaderFilter(I.Resolved->getName()) ||
+        I.Resolved->getDir() == ResourceDir)
       continue;
     if (PI) {
       if (PI->shouldKeep(*I.Resolved))
@@ -137,7 +138,7 @@ analyze(llvm::ArrayRef<Decl *> ASTRoots,
         // Since most private -> public mappings happen in a verbatim way, we
         // check textually here. This might go wrong in presence of symlinks or
         // header mappings. But that's not different than rest of the places.
-        if (MainFile->tryGetRealPathName().ends_with(PHeader))
+        if (MainFile.getName().ends_with(PHeader))
           continue;
       }
     }
diff --git a/clang-tools-extra/include-cleaner/lib/HTMLReport.cpp b/clang-tools-extra/include-cleaner/lib/HTMLReport.cpp
index 195f658a0af920..bbe8bc230c6e20 100644
--- a/clang-tools-extra/include-cleaner/lib/HTMLReport.cpp
+++ b/clang-tools-extra/include-cleaner/lib/HTMLReport.cpp
@@ -14,6 +14,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AnalysisInternal.h"
+#include "clang-include-cleaner/IncludeSpeller.h"
 #include "clang-include-cleaner/Types.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/PrettyPrinter.h"
@@ -167,22 +168,6 @@ class Reporter {
     return "semiused";
   }
 
-  std::string spellHeader(const Header &H) {
-    switch (H.kind()) {
-    case Header::Physical: {
-      bool IsAngled = false;
-      std::string Path = HS.suggestPathToFileForDiagnostics(
-          H.physical(), MainFE->tryGetRealPathName(), &IsAngled);
-      return IsAngled ? "<" + Path + ">" : "\"" + Path + "\"";
-    }
-    case Header::Standard:
-      return H.standard().name().str();
-    case Header::Verbatim:
-      return H.verbatim().str();
-    }
-    llvm_unreachable("Unknown Header kind");
-  }
-
   void fillTarget(Ref &R) {
     // Duplicates logic from walkUsed(), which doesn't expose SymbolLocations.
     for (auto &Loc : locateSymbol(R.Sym))
@@ -204,7 +189,7 @@ class Reporter {
                      R.Includes.end());
 
     if (!R.Headers.empty())
-      R.Insert = spellHeader(R.Headers.front());
+      R.Insert = spellHeader({R.Headers.front(), HS, MainFE});
   }
 
 public:
diff --git a/clang-tools-extra/include-cleaner/lib/Types.cpp b/clang-tools-extra/include-cleaner/lib/Types.cpp
index cb8a55ed13e5d0..7a637639edf8b5 100644
--- a/clang-tools-extra/include-cleaner/lib/Types.cpp
+++ b/clang-tools-extra/include-cleaner/lib/Types.cpp
@@ -10,7 +10,6 @@
 #include "TypesInternal.h"
 #include "clang/AST/Decl.h"
 #include "clang/Basic/FileEntry.h"
-#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
@@ -48,7 +47,7 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Symbol &S) {
 llvm::StringRef Header::resolvedPath() const {
   switch (kind()) {
   case include_cleaner::Header::Physical:
-    return physical().getFileEntry().tryGetRealPathName();
+    return physical().getName();
   case include_cleaner::Header::Standard:
     return standard().name().trim("<>\"");
   case include_cleaner::Header::Verbatim:
diff --git a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp
index 3bc449b0152bba..d8a44ab9b6e12e 100644
--- a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp
+++ b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp
@@ -164,7 +164,7 @@ class Action : public clang::ASTFrontendAction {
       writeHTML();
 
     llvm::StringRef Path =
-        SM.getFileEntryForID(SM.getMainFileID())->tryGetRealPathName();
+        SM.getFileEntryRefForID(SM.getMainFileID())->getName();
     assert(!Path.empty() && "Main file path not known?");
     llvm::StringRef Code = SM.getBufferData(SM.getMainFileID());
 
diff --git a/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp b/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp
index 5696c380758f85..43634ee8f2d803 100644
--- a/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp
+++ b/clang-tools-extra/include-cleaner/unittests/AnalysisTest.cpp
@@ -22,9 +22,12 @@
 #include "clang/Testing/TestAST.h"
 #include "clang/Tooling/Inclusions/StandardLibrary.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/IntrusiveRefCntPtr.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Testing/Annotations/Annotations.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
@@ -204,21 +207,37 @@ class AnalyzeTest : public testing::Test {
   TestInputs Inputs;
   PragmaIncludes PI;
   RecordedPP PP;
+  llvm::IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> ExtraFS = nullptr;
+
   AnalyzeTest() {
     Inputs.MakeAction = [this] {
       struct Hook : public SyntaxOnlyAction {
       public:
-        Hook(RecordedPP &PP, PragmaIncludes &PI) : PP(PP), PI(PI) {}
+        Hook(RecordedPP &PP, PragmaIncludes &PI,
+             llvm::IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> ExtraFS)
+            : PP(PP), PI(PI), ExtraFS(std::move(ExtraFS)) {}
         bool BeginSourceFileAction(clang::CompilerInstance &CI) override {
           CI.getPreprocessor().addPPCallbacks(PP.record(CI.getPreprocessor()));
           PI.record(CI);
           return true;
         }
 
+        bool BeginInvocation(CompilerInstance &CI) override {
+          if (!ExtraFS)
+            return true;
+          auto OverlayFS =
+              llvm::makeIntrusiveRefCnt<llvm::vfs::OverlayFileSystem>(
+                  CI.getFileManager().getVirtualFileSystemPtr());
+          OverlayFS->pushOverlay(ExtraFS);
+          CI.getFileManager().setVirtualFileSystem(std::move(OverlayFS));
+          return true;
+        }
+
         RecordedPP &PP;
         PragmaIncludes &PI;
+        llvm::IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> ExtraFS;
       };
-      return std::make_unique<Hook>(PP, PI);
+      return std::make_unique<Hook>(PP, PI, ExtraFS);
     };
   }
 };
@@ -322,6 +341,58 @@ TEST_F(AnalyzeTest, DifferentHeaderSameSpelling) {
   EXPECT_THAT(Results.Missing, testing::IsEmpty());
 }
 
+TEST_F(AnalyzeTest, SpellingIncludesWithSymlinks) {
+  llvm::Annotations Code(R"cpp(
+  #include "header.h"
+  void $bar^bar() {
+    $foo^foo();
+  }
+  )cpp");
+  Inputs.Code = Code.code();
+  ExtraFS = llvm::makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>();
+  ExtraFS->addFile("content_for/0", /*ModificationTime=*/{},
+                   llvm::MemoryBuffer::getMemBufferCopy(guard(R"cpp(
+  #include "inner.h"
+  )cpp")));
+  ExtraFS->addSymbolicLink("header.h", "content_for/0",
+                           /*ModificationTime=*/{});
+  ExtraFS->addFile("content_for/1", /*ModificationTime=*/{},
+                   llvm::MemoryBuffer::getMemBufferCopy(guard(R"cpp(
+  void foo();
+  )cpp")));
+  ExtraFS->addSymbolicLink("inner.h", "content_for/1",
+                           /*ModificationTime=*/{});
+
+  TestAST AST(Inputs);
+  std::vector<Decl *> DeclsInTU;
+  for (auto *D : AST.context().getTranslationUnitDecl()->decls())
+    DeclsInTU.push_back(D);
+  auto Results = analyze(DeclsInTU, {}, PP.Includes, &PI, AST.preprocessor());
+  // Check that we're spelling header using the symlink, and not underlying
+  // path.
+  EXPECT_THAT(Results.Missing, testing::ElementsAre("\"inner.h\""));
+  // header.h should be unused.
+  EXPECT_THAT(Results.Unused, Not(testing::IsEmpty()));
+
+  {
+    // Make sure filtering is also applied to symlink, not underlying file.
+    auto HeaderFilter = [](llvm::StringRef Path) { return Path == "inner.h"; };
+    Results = analyze(DeclsInTU, {}, PP.Includes, &PI, AST.preprocessor(),
+                      HeaderFilter);
+    EXPECT_THAT(Results.Missing, testing::ElementsAre("\"inner.h\""));
+    // header.h should be unused.
+    EXPECT_THAT(Results.Unused, Not(testing::IsEmpty()));
+  }
+  {
+    auto HeaderFilter = [](llvm::StringRef Path) { return Path == "header.h"; };
+    Results = analyze(DeclsInTU, {}, PP.Includes, &PI, AST.preprocessor(),
+                      HeaderFilter);
+    // header.h should be ignored now. NOTE(review): yet Unused is non-empty?
+    EXPECT_THAT(Results.Unused, Not(testing::IsEmpty()));
+    EXPECT_THAT(Results.Missing, testing::ElementsAre("\"inner.h\""));
+  }
+}
+
 TEST(FixIncludes, Basic) {
   llvm::StringRef Code = R"cpp(#include "d.h"
 #include "a.h"
diff --git a/clang-tools-extra/include-cleaner/unittests/IncludeSpellerTest.cpp b/clang-tools-extra/include-cleaner/unittests/IncludeSpellerTest.cpp
index 8f6ad09c46cc4a..a27e83a434372f 100644
--- a/clang-tools-extra/include-cleaner/unittests/IncludeSpellerTest.cpp
+++ b/clang-tools-extra/include-cleaner/unittests/IncludeSpellerTest.cpp
@@ -47,8 +47,7 @@ class DummyIncludeSpeller : public IncludeSpeller {
       return "<bits/stdc++.h>";
     if (Input.H.kind() != Header::Physical)
       return "";
-    llvm::StringRef AbsolutePath =
-        Input.H.physical().getFileEntry().tryGetRealPathName();
+    llvm::StringRef AbsolutePath = Input.H.resolvedPath();
     std::string RootWithSeparator{testRoot()};
     RootWithSeparator += llvm::sys::path::get_separator();
     if (!AbsolutePath.consume_front(llvm::StringRef{RootWithSeparator}))
diff --git a/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp b/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp
index 1a5996e5df284b..715d95eb573464 100644
--- a/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp
+++ b/clang-tools-extra/include-cleaner/unittests/RecordTest.cpp
@@ -10,6 +10,7 @@
 #include "clang-include-cleaner/Types.h"
 #include "clang/AST/Decl.h"
 #include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/FileEntry.h"
 #include "clang/Basic/LLVM.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Frontend/CompilerInvocation.h"
@@ -24,6 +25,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Testing/Annotations/Annotations.h"
@@ -53,9 +55,11 @@ MATCHER_P(named, N, "") {
 }
 
 MATCHER_P(FileNamed, N, "") {
-  if (arg.getFileEntry().tryGetRealPathName() == N)
+  llvm::StringRef ActualName =
+      llvm::sys::path::remove_leading_dotslash(arg.getName());
+  if (ActualName == N)
     return true;
-  *result_listener << arg.getFileEntry().tryGetRealPathName().str();
+  *result_listener << ActualName.str();
   return false;
 }
 
@@ -317,7 +321,8 @@ class PragmaIncludeTest : public ::testing::Test {
   }
 
   TestAST build(bool ResetPragmaIncludes = true) {
-    if (ResetPragmaIncludes) PI = PragmaIncludes();
+    if (ResetPragmaIncludes)
+      PI = PragmaIncludes();
     return TestAST(Inputs);
   }
 
@@ -535,16 +540,33 @@ TEST_F(PragmaIncludeTest, IWYUExportBlock) {
   TestAST Processed = build();
   auto &FM = Processed.fileManager();
 
-  EXPECT_THAT(PI.getExporters(FM.getFile("private1.h").get(), FM),
-              testing::UnorderedElementsAre(FileNamed("export1.h"),
-                                            FileNamed("normal.h")));
-  EXPECT_THAT(PI.getExporters(FM.getFile("private2.h").get(), FM),
-              testing::UnorderedElementsAre(FileNamed("export1.h")));
-  EXPECT_THAT(PI.getExporters(FM.getFile("private3.h").get(), FM),
-              testing::UnorderedElementsAre(FileNamed("export1.h")));
-
-  EXPECT_TRUE(PI.getExporters(FM.getFile("foo.h").get(), FM).empty());
-  EXPECT_TRUE(PI.getExporters(FM.getFile("bar.h").get(), FM).empty());
+  auto GetNames = [](llvm::ArrayRef<FileEntryRef> FEs) {
+    std::string Result;
+    llvm::raw_string_ostream OS(Result);
+    for (auto &FE : FEs) {
+      OS << FE.getName() << " ";
+    }
+    OS.flush();
+    return Result;
+  };
+  auto Exporters = PI.getExporters(FM.getFile("private1.h").get(), FM);
+  EXPECT_THAT(Exporters, testing::UnorderedElementsAre(FileNamed("export1.h"),
+                                                       FileNamed("normal.h")))
+      << GetNames(Exporters);
+
+  Exporters = PI.getExporters(FM.getFile("private2.h").get(), FM);
+  EXPECT_THAT(Exporters, testing::UnorderedElementsAre(FileNamed("export1.h")))
+      << GetNames(Exporters);
+
+  Exporters = PI.getExporters(FM.getFile("private3.h").get(), FM);
+  EXPECT_THAT(Exporters, testing::UnorderedElementsAre(FileNamed("export1.h")))
+      << GetNames(Exporters);
+
+  Exporters = PI.getExporters(FM.getFile("foo.h").get(), FM);
+  EXPECT_TRUE(Exporters.empty()) << GetNames(Exporters);
+
+  Exporters = PI.getExporters(FM.getFile("bar.h").get(), FM);
+  EXPECT_TRUE(Exporters.empty()) << GetNames(Exporters);
 }
 
 TEST_F(PragmaIncludeTest, SelfContained) {
diff --git a/clang/docs/APINotes.rst b/clang/docs/APINotes.rst
index bc09b16bab5d27..dcefa6810dac67 100644
--- a/clang/docs/APINotes.rst
+++ b/clang/docs/APINotes.rst
@@ -188,6 +188,18 @@ declaration kind), all of which are optional:
     - Name: tzdb
       SwiftCopyable: false
 
+:SwiftConformsTo:
+
+  Allows annotating a C++ class as conforming to a Swift protocol. Equivalent
+  to ``SWIFT_CONFORMS_TO_PROTOCOL``. The value is a module-qualified name of a
+  Swift protocol.
+
+  ::
+
+    Tags:
+    - Name: vector
+      SwiftConformsTo: Cxx.CxxSequence
+
 :Availability, AvailabilityMsg:
 
   A value of "nonswift" is equivalent to ``NS_SWIFT_UNAVAILABLE``. A value of
diff --git a/clang/docs/StandardCPlusPlusModules.rst b/clang/docs/StandardCPlusPlusModules.rst
index 2478a77e7640c5..ccc0cb59f8e710 100644
--- a/clang/docs/StandardCPlusPlusModules.rst
+++ b/clang/docs/StandardCPlusPlusModules.rst
@@ -1230,6 +1230,58 @@ parsing their headers, those should be included after the import. If the
 imported modules don't provide such a header, one can be made manually for
 improved compile time performance.
 
+Reachability of internal partition units
+----------------------------------------
+
+Internal partition units are sometimes called implementation partition units in other documentation.
+However, that name may be confusing since implementation partition units are not implementation
+units.
+
+According to `[module.reach]p1 <https://eel.is/c++draft/module.reach#1>`_ and
+`[module.reach]p2 <https://eel.is/c++draft/module.reach#2>`_ (from N4986):
+
+  A translation unit U is necessarily reachable from a point P if U is a module
+  interface unit on which the translation unit containing P has an interface
+  dependency, or the translation unit containing P imports U, in either case
+  prior to P.
+
+  All translation units that are necessarily reachable are reachable. Additional
+  translation units on which the point within the program has an interface
+  dependency may be considered reachable, but it is unspecified which are and
+  under what circumstances.
+
+For example,
+
+.. code-block:: c++
+
+  // a.cpp
+  import B;
+  int main()
+  {
+      g<void>();
+  }
+
+  // b.cppm
+  export module B;
+  import :C;
+  export template <typename T> inline void g() noexcept
+  {
+      return f<T>();
+  }
+
+  // c.cppm
+  module B:C;
+  template<typename> inline void f() noexcept {}
+
+The internal partition unit ``c.cppm`` is not necessarily reachable from
+``a.cpp`` because ``c.cppm`` is not a module interface unit and ``a.cpp``
+doesn't import ``c.cppm``. This leaves it up to the compiler to decide if
+``c.cppm`` is reachable from ``a.cpp`` or not. Clang's behavior is that
+indirectly imported internal partition units are not reachable.
+
+The suggested approach for using internal partition units in Clang is to
+import them only in implementation units.
+
 Known Issues
 ------------
 
diff --git a/clang/include/clang/APINotes/Types.h b/clang/include/clang/APINotes/Types.h
index c8e5e4df25d173..f972d0cf26640d 100644
--- a/clang/include/clang/APINotes/Types.h
+++ b/clang/include/clang/APINotes/Types.h
@@ -685,6 +685,9 @@ class TagInfo : public CommonTypeInfo {
   std::optional<std::string> SwiftRetainOp;
   std::optional<std::string> SwiftReleaseOp;
 
+  /// The Swift protocol that this type should be automatically conformed to.
+  std::optional<std::string> SwiftConformance;
+
   std::optional<EnumExtensibilityKind> EnumExtensibility;
 
   TagInfo()
@@ -720,6 +723,9 @@ class TagInfo : public CommonTypeInfo {
     if (!SwiftReleaseOp)
       SwiftReleaseOp = RHS.SwiftReleaseOp;
 
+    if (!SwiftConformance)
+      SwiftConformance = RHS.SwiftConformance;
+
     if (!HasFlagEnum)
       setFlagEnum(RHS.isFlagEnum());
 
@@ -742,6 +748,7 @@ inline bool operator==(const TagInfo &LHS, const TagInfo &RHS) {
          LHS.SwiftImportAs == RHS.SwiftImportAs &&
          LHS.SwiftRetainOp == RHS.SwiftRetainOp &&
          LHS.SwiftReleaseOp == RHS.SwiftReleaseOp &&
+         LHS.SwiftConformance == RHS.SwiftConformance &&
          LHS.isFlagEnum() == RHS.isFlagEnum() &&
          LHS.isSwiftCopyable() == RHS.isSwiftCopyable() &&
          LHS.EnumExtensibility == RHS.EnumExtensibility;
diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h
index 04dbd1db6cba81..ee662ed73d7e0e 100644
--- a/clang/include/clang/AST/DeclBase.h
+++ b/clang/include/clang/AST/DeclBase.h
@@ -324,6 +324,7 @@ class alignas(8) Decl {
   static bool StatisticsEnabled;
 
 protected:
+  friend class ASTDeclMerger;
   friend class ASTDeclReader;
   friend class ASTDeclWriter;
   friend class ASTNodeImporter;
diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h
index bf6a5ce92d438d..0d72cc6a08dcb4 100644
--- a/clang/include/clang/AST/DeclCXX.h
+++ b/clang/include/clang/AST/DeclCXX.h
@@ -256,6 +256,7 @@ class CXXBaseSpecifier {
 
 /// Represents a C++ struct/union/class.
 class CXXRecordDecl : public RecordDecl {
+  friend class ASTDeclMerger;
   friend class ASTDeclReader;
   friend class ASTDeclWriter;
   friend class ASTNodeImporter;
diff --git a/clang/include/clang/AST/DeclObjC.h b/clang/include/clang/AST/DeclObjC.h
index d2cc61ca19f8a5..1cda70530d7d83 100644
--- a/clang/include/clang/AST/DeclObjC.h
+++ b/clang/include/clang/AST/DeclObjC.h
@@ -1747,6 +1747,7 @@ class ObjCInterfaceDecl : public ObjCContainerDecl
   static bool isKnownExtension(ObjCCategoryDecl *Cat);
 
 public:
+  friend class ASTDeclMerger;
   friend class ASTDeclReader;
   friend class ASTDeclWriter;
   friend class ASTReader;
@@ -2134,6 +2135,7 @@ class ObjCProtocolDecl : public ObjCContainerDecl,
   void setHasODRHash(bool HasHash);
 
 public:
+  friend class ASTDeclMerger;
   friend class ASTDeclReader;
   friend class ASTDeclWriter;
   friend class ASTReader;
diff --git a/clang/include/clang/AST/Redeclarable.h b/clang/include/clang/AST/Redeclarable.h
index 74ccd74ed60d6b..8d320a9ced2792 100644
--- a/clang/include/clang/AST/Redeclarable.h
+++ b/clang/include/clang/AST/Redeclarable.h
@@ -191,6 +191,7 @@ class Redeclarable {
   }
 
 public:
+  friend class ASTDeclMerger;
   friend class ASTDeclReader;
   friend class ASTDeclWriter;
   friend class IncrementalParser;
diff --git a/clang/include/clang/AST/TextNodeDumper.h b/clang/include/clang/AST/TextNodeDumper.h
index 39dd1f515c9eb3..88d5535829910f 100644
--- a/clang/include/clang/AST/TextNodeDumper.h
+++ b/clang/include/clang/AST/TextNodeDumper.h
@@ -410,6 +410,7 @@ class TextNodeDumper
   void VisitOpenACCConstructStmt(const OpenACCConstructStmt *S);
   void VisitOpenACCLoopConstruct(const OpenACCLoopConstruct *S);
   void VisitEmbedExpr(const EmbedExpr *S);
+  void VisitAtomicExpr(const AtomicExpr *AE);
 };
 
 } // namespace clang
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 54e689a7a42213..ca9c00a1473bbd 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -298,6 +298,7 @@ LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kern
 LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP")
 LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP")
 LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.")
+LANGOPT(OffloadViaLLVM, 1, 0, "target LLVM/Offload as portable offloading runtime.")
 
 LANGOPT(SYCLIsDevice      , 1, 0, "Generate code for SYCL device")
 LANGOPT(SYCLIsHost        , 1, 0, "SYCL host compilation")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 0b38139bd27972..75320cafaefa5f 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1302,6 +1302,12 @@ def no_offload_compress : Flag<["--"], "no-offload-compress">;
 def offload_compression_level_EQ : Joined<["--"], "offload-compression-level=">,
   Flags<[HelpHidden]>,
   HelpText<"Compression level for offload device binaries (HIP only)">;
+
+defm offload_via_llvm : BoolFOption<"offload-via-llvm",
+  LangOpts<"OffloadViaLLVM">, DefaultFalse,
+  PosFlag<SetTrue, [], [ClangOption, CC1Option], "Use">,
+  NegFlag<SetFalse, [], [ClangOption], "Don't use">,
+  BothFlags<[], [ClangOption], " LLVM/Offload as portable offloading runtime.">>;
 }
 
 // CUDA options
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index a0e90e62bd60ec..4593213c5f43ce 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -378,6 +378,7 @@ class ASTReader
 {
 public:
   /// Types of AST files.
+  friend class ASTDeclMerger;
   friend class ASTDeclReader;
   friend class ASTIdentifierIterator;
   friend class ASTRecordReader;
diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h
index f7b4510d7f7beb..d12814e7c9253e 100644
--- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h
+++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h
@@ -353,6 +353,12 @@ class DependencyScanningWorkerFilesystem
 
   std::error_code setCurrentWorkingDirectory(const Twine &Path) override;
 
+  /// Make it so that no paths bypass this VFS.
+  void resetBypassedPathPrefix() { BypassedPathPrefix.reset(); }
+  /// Set the prefix for paths that should bypass this VFS and go straight to
+  /// the underlying VFS.
+  void setBypassedPathPrefix(StringRef Prefix) { BypassedPathPrefix = Prefix; }
+
   /// Returns entry for the given filename.
   ///
   /// Attempts to use the local and shared caches first, then falls back to
@@ -450,12 +456,19 @@ class DependencyScanningWorkerFilesystem
     getUnderlyingFS().print(OS, Type, IndentLevel + 1);
   }
 
+  /// Whether this path should bypass this VFS and go straight to the underlying
+  /// VFS.
+  bool shouldBypass(StringRef Path) const;
+
   /// The global cache shared between worker threads.
   DependencyScanningFilesystemSharedCache &SharedCache;
   /// The local cache is used by the worker thread to cache file system queries
   /// locally instead of querying the global cache every time.
   DependencyScanningFilesystemLocalCache LocalCache;
 
+  /// Prefix of paths that should go straight to the underlying VFS.
+  std::optional<std::string> BypassedPathPrefix;
+
   /// The working directory to use for making relative paths absolute before
   /// using them for cache lookups.
   llvm::ErrorOr<std::string> WorkingDirForCacheLookup;
diff --git a/clang/lib/APINotes/APINotesFormat.h b/clang/lib/APINotes/APINotesFormat.h
index 9d254dcc1c9eff..fba5f4e8907dae 100644
--- a/clang/lib/APINotes/APINotesFormat.h
+++ b/clang/lib/APINotes/APINotesFormat.h
@@ -24,7 +24,7 @@ const uint16_t VERSION_MAJOR = 0;
 /// API notes file minor version number.
 ///
 /// When the format changes IN ANY WAY, this number should be incremented.
-const uint16_t VERSION_MINOR = 28; // nested tags
+const uint16_t VERSION_MINOR = 29; // SwiftConformsTo
 
 const uint8_t kSwiftCopyable = 1;
 const uint8_t kSwiftNonCopyable = 2;
diff --git a/clang/lib/APINotes/APINotesReader.cpp b/clang/lib/APINotes/APINotesReader.cpp
index 871f782511d5f1..c05fdffe4a071b 100644
--- a/clang/lib/APINotes/APINotesReader.cpp
+++ b/clang/lib/APINotes/APINotesReader.cpp
@@ -572,6 +572,12 @@ class TagTableInfo
                                         ReleaseOpLength - 1);
       Data += ReleaseOpLength - 1;
     }
+    if (unsigned ConformanceLength =
+            endian::readNext<uint16_t, llvm::endianness::little>(Data)) {
+      Info.SwiftConformance = std::string(reinterpret_cast<const char *>(Data),
+                                          ConformanceLength - 1);
+      Data += ConformanceLength - 1;
+    }
 
     ReadCommonTypeInfo(Data, Info);
     return Info;
diff --git a/clang/lib/APINotes/APINotesWriter.cpp b/clang/lib/APINotes/APINotesWriter.cpp
index 2a71922746ac5d..cf3a0bee393eee 100644
--- a/clang/lib/APINotes/APINotesWriter.cpp
+++ b/clang/lib/APINotes/APINotesWriter.cpp
@@ -1189,6 +1189,7 @@ class TagTableInfo : public CommonTypeTableInfo<TagTableInfo, TagInfo> {
     return 2 + (TI.SwiftImportAs ? TI.SwiftImportAs->size() : 0) +
            2 + (TI.SwiftRetainOp ? TI.SwiftRetainOp->size() : 0) +
            2 + (TI.SwiftReleaseOp ? TI.SwiftReleaseOp->size() : 0) +
+           2 + (TI.SwiftConformance ? TI.SwiftConformance->size() : 0) +
            2 + getCommonTypeInfoSize(TI);
   }
 
@@ -1230,6 +1231,12 @@ class TagTableInfo : public CommonTypeTableInfo<TagTableInfo, TagInfo> {
     } else {
       writer.write<uint16_t>(0);
     }
+    if (auto Conformance = TI.SwiftConformance) {
+      writer.write<uint16_t>(Conformance->size() + 1);
+      OS.write(Conformance->c_str(), Conformance->size());
+    } else {
+      writer.write<uint16_t>(0);
+    }
 
     emitCommonTypeInfo(OS, TI);
   }
diff --git a/clang/lib/APINotes/APINotesYAMLCompiler.cpp b/clang/lib/APINotes/APINotesYAMLCompiler.cpp
index 11cccc94a15f03..2205686c4d15c3 100644
--- a/clang/lib/APINotes/APINotesYAMLCompiler.cpp
+++ b/clang/lib/APINotes/APINotesYAMLCompiler.cpp
@@ -419,6 +419,7 @@ struct Tag {
   std::optional<std::string> SwiftImportAs;
   std::optional<std::string> SwiftRetainOp;
   std::optional<std::string> SwiftReleaseOp;
+  std::optional<std::string> SwiftConformance;
   std::optional<EnumExtensibilityKind> EnumExtensibility;
   std::optional<bool> FlagEnum;
   std::optional<EnumConvenienceAliasKind> EnumConvenienceKind;
@@ -456,6 +457,7 @@ template <> struct MappingTraits<Tag> {
     IO.mapOptional("SwiftImportAs", T.SwiftImportAs);
     IO.mapOptional("SwiftReleaseOp", T.SwiftReleaseOp);
     IO.mapOptional("SwiftRetainOp", T.SwiftRetainOp);
+    IO.mapOptional("SwiftConformsTo", T.SwiftConformance);
     IO.mapOptional("EnumExtensibility", T.EnumExtensibility);
     IO.mapOptional("FlagEnum", T.FlagEnum);
     IO.mapOptional("EnumKind", T.EnumConvenienceKind);
@@ -920,6 +922,8 @@ class YAMLConverter {
       TI.SwiftRetainOp = T.SwiftRetainOp;
     if (T.SwiftReleaseOp)
       TI.SwiftReleaseOp = T.SwiftReleaseOp;
+    if (T.SwiftConformance)
+      TI.SwiftConformance = T.SwiftConformance;
 
     if (T.SwiftCopyable)
       TI.setSwiftCopyable(T.SwiftCopyable);
diff --git a/clang/lib/AST/CMakeLists.txt b/clang/lib/AST/CMakeLists.txt
index 70aecb781c2ff2..44d944d4e948cb 100644
--- a/clang/lib/AST/CMakeLists.txt
+++ b/clang/lib/AST/CMakeLists.txt
@@ -66,6 +66,7 @@ add_clang_library(clangAST
   InheritViz.cpp
   Interp/ByteCodeEmitter.cpp
   Interp/Compiler.cpp
+  Interp/CompilerComplex.cpp
   Interp/Context.cpp
   Interp/Descriptor.cpp
   Interp/Disasm.cpp
diff --git a/clang/lib/AST/Interp/Compiler.cpp b/clang/lib/AST/Interp/Compiler.cpp
index dd24cff1bab46e..3b42590a61eb50 100644
--- a/clang/lib/AST/Interp/Compiler.cpp
+++ b/clang/lib/AST/Interp/Compiler.cpp
@@ -982,234 +982,6 @@ bool Compiler<Emitter>::VisitLogicalBinOp(const BinaryOperator *E) {
   return true;
 }
 
-template <class Emitter>
-bool Compiler<Emitter>::VisitComplexBinOp(const BinaryOperator *E) {
-  // Prepare storage for result.
-  if (!Initializing) {
-    std::optional<unsigned> LocalIndex = allocateLocal(E);
-    if (!LocalIndex)
-      return false;
-    if (!this->emitGetPtrLocal(*LocalIndex, E))
-      return false;
-  }
-
-  // Both LHS and RHS might _not_ be of complex type, but one of them
-  // needs to be.
-  const Expr *LHS = E->getLHS();
-  const Expr *RHS = E->getRHS();
-
-  PrimType ResultElemT = this->classifyComplexElementType(E->getType());
-  unsigned ResultOffset = ~0u;
-  if (!DiscardResult)
-    ResultOffset = this->allocateLocalPrimitive(E, PT_Ptr, true, false);
-
-  // Save result pointer in ResultOffset
-  if (!this->DiscardResult) {
-    if (!this->emitDupPtr(E))
-      return false;
-    if (!this->emitSetLocal(PT_Ptr, ResultOffset, E))
-      return false;
-  }
-  QualType LHSType = LHS->getType();
-  if (const auto *AT = LHSType->getAs<AtomicType>())
-    LHSType = AT->getValueType();
-  QualType RHSType = RHS->getType();
-  if (const auto *AT = RHSType->getAs<AtomicType>())
-    RHSType = AT->getValueType();
-
-  bool LHSIsComplex = LHSType->isAnyComplexType();
-  unsigned LHSOffset;
-  bool RHSIsComplex = RHSType->isAnyComplexType();
-
-  // For ComplexComplex Mul, we have special ops to make their implementation
-  // easier.
-  BinaryOperatorKind Op = E->getOpcode();
-  if (Op == BO_Mul && LHSIsComplex && RHSIsComplex) {
-    assert(classifyPrim(LHSType->getAs<ComplexType>()->getElementType()) ==
-           classifyPrim(RHSType->getAs<ComplexType>()->getElementType()));
-    PrimType ElemT =
-        classifyPrim(LHSType->getAs<ComplexType>()->getElementType());
-    if (!this->visit(LHS))
-      return false;
-    if (!this->visit(RHS))
-      return false;
-    return this->emitMulc(ElemT, E);
-  }
-
-  if (Op == BO_Div && RHSIsComplex) {
-    QualType ElemQT = RHSType->getAs<ComplexType>()->getElementType();
-    PrimType ElemT = classifyPrim(ElemQT);
-    // If the LHS is not complex, we still need to do the full complex
-    // division, so just stub create a complex value and stub it out with
-    // the LHS and a zero.
-
-    if (!LHSIsComplex) {
-      // This is using the RHS type for the fake-complex LHS.
-      if (auto LHSO = allocateLocal(RHS))
-        LHSOffset = *LHSO;
-      else
-        return false;
-
-      if (!this->emitGetPtrLocal(LHSOffset, E))
-        return false;
-
-      if (!this->visit(LHS))
-        return false;
-      // real is LHS
-      if (!this->emitInitElem(ElemT, 0, E))
-        return false;
-      // imag is zero
-      if (!this->visitZeroInitializer(ElemT, ElemQT, E))
-        return false;
-      if (!this->emitInitElem(ElemT, 1, E))
-        return false;
-    } else {
-      if (!this->visit(LHS))
-        return false;
-    }
-
-    if (!this->visit(RHS))
-      return false;
-    return this->emitDivc(ElemT, E);
-  }
-
-  // Evaluate LHS and save value to LHSOffset.
-  if (LHSType->isAnyComplexType()) {
-    LHSOffset = this->allocateLocalPrimitive(LHS, PT_Ptr, true, false);
-    if (!this->visit(LHS))
-      return false;
-    if (!this->emitSetLocal(PT_Ptr, LHSOffset, E))
-      return false;
-  } else {
-    PrimType LHST = classifyPrim(LHSType);
-    LHSOffset = this->allocateLocalPrimitive(LHS, LHST, true, false);
-    if (!this->visit(LHS))
-      return false;
-    if (!this->emitSetLocal(LHST, LHSOffset, E))
-      return false;
-  }
-
-  // Same with RHS.
-  unsigned RHSOffset;
-  if (RHSType->isAnyComplexType()) {
-    RHSOffset = this->allocateLocalPrimitive(RHS, PT_Ptr, true, false);
-    if (!this->visit(RHS))
-      return false;
-    if (!this->emitSetLocal(PT_Ptr, RHSOffset, E))
-      return false;
-  } else {
-    PrimType RHST = classifyPrim(RHSType);
-    RHSOffset = this->allocateLocalPrimitive(RHS, RHST, true, false);
-    if (!this->visit(RHS))
-      return false;
-    if (!this->emitSetLocal(RHST, RHSOffset, E))
-      return false;
-  }
-
-  // For both LHS and RHS, either load the value from the complex pointer, or
-  // directly from the local variable. For index 1 (i.e. the imaginary part),
-  // just load 0 and do the operation anyway.
-  auto loadComplexValue = [this](bool IsComplex, bool LoadZero,
-                                 unsigned ElemIndex, unsigned Offset,
-                                 const Expr *E) -> bool {
-    if (IsComplex) {
-      if (!this->emitGetLocal(PT_Ptr, Offset, E))
-        return false;
-      return this->emitArrayElemPop(classifyComplexElementType(E->getType()),
-                                    ElemIndex, E);
-    }
-    if (ElemIndex == 0 || !LoadZero)
-      return this->emitGetLocal(classifyPrim(E->getType()), Offset, E);
-    return this->visitZeroInitializer(classifyPrim(E->getType()), E->getType(),
-                                      E);
-  };
-
-  // Now we can get pointers to the LHS and RHS from the offsets above.
-  for (unsigned ElemIndex = 0; ElemIndex != 2; ++ElemIndex) {
-    // Result pointer for the store later.
-    if (!this->DiscardResult) {
-      if (!this->emitGetLocal(PT_Ptr, ResultOffset, E))
-        return false;
-    }
-
-    // The actual operation.
-    switch (Op) {
-    case BO_Add:
-      if (!loadComplexValue(LHSIsComplex, true, ElemIndex, LHSOffset, LHS))
-        return false;
-
-      if (!loadComplexValue(RHSIsComplex, true, ElemIndex, RHSOffset, RHS))
-        return false;
-      if (ResultElemT == PT_Float) {
-        if (!this->emitAddf(getRoundingMode(E), E))
-          return false;
-      } else {
-        if (!this->emitAdd(ResultElemT, E))
-          return false;
-      }
-      break;
-    case BO_Sub:
-      if (!loadComplexValue(LHSIsComplex, true, ElemIndex, LHSOffset, LHS))
-        return false;
-
-      if (!loadComplexValue(RHSIsComplex, true, ElemIndex, RHSOffset, RHS))
-        return false;
-      if (ResultElemT == PT_Float) {
-        if (!this->emitSubf(getRoundingMode(E), E))
-          return false;
-      } else {
-        if (!this->emitSub(ResultElemT, E))
-          return false;
-      }
-      break;
-    case BO_Mul:
-      if (!loadComplexValue(LHSIsComplex, false, ElemIndex, LHSOffset, LHS))
-        return false;
-
-      if (!loadComplexValue(RHSIsComplex, false, ElemIndex, RHSOffset, RHS))
-        return false;
-
-      if (ResultElemT == PT_Float) {
-        if (!this->emitMulf(getRoundingMode(E), E))
-          return false;
-      } else {
-        if (!this->emitMul(ResultElemT, E))
-          return false;
-      }
-      break;
-    case BO_Div:
-      assert(!RHSIsComplex);
-      if (!loadComplexValue(LHSIsComplex, false, ElemIndex, LHSOffset, LHS))
-        return false;
-
-      if (!loadComplexValue(RHSIsComplex, false, ElemIndex, RHSOffset, RHS))
-        return false;
-
-      if (ResultElemT == PT_Float) {
-        if (!this->emitDivf(getRoundingMode(E), E))
-          return false;
-      } else {
-        if (!this->emitDiv(ResultElemT, E))
-          return false;
-      }
-      break;
-
-    default:
-      return false;
-    }
-
-    if (!this->DiscardResult) {
-      // Initialize array element with the value we just computed.
-      if (!this->emitInitElemPop(ResultElemT, ElemIndex, E))
-        return false;
-    } else {
-      if (!this->emitPop(ResultElemT, E))
-        return false;
-    }
-  }
-  return true;
-}
-
 template <class Emitter>
 bool Compiler<Emitter>::VisitImplicitValueInitExpr(
     const ImplicitValueInitExpr *E) {
@@ -4856,6 +4628,50 @@ bool Compiler<Emitter>::compileConstructor(const CXXConstructorDecl *Ctor) {
   return this->emitRetVoid(SourceInfo{});
 }
 
+template <class Emitter>
+bool Compiler<Emitter>::compileDestructor(const CXXDestructorDecl *Dtor) {
+  const RecordDecl *RD = Dtor->getParent();
+  const Record *R = this->getRecord(RD);
+  if (!R)
+    return false;
+
+  if (!Dtor->isTrivial() && Dtor->getBody()) {
+    if (!this->visitStmt(Dtor->getBody()))
+      return false;
+  }
+
+  if (!this->emitThis(Dtor))
+    return false;
+
+  assert(R);
+  if (!R->isUnion()) {
+    // Destroy fields in reverse order; primitive (array) fields are skipped.
+    for (const Record::Field &Field : llvm::reverse(R->fields())) {
+      const Descriptor *D = Field.Desc;
+      if (!D->isPrimitive() && !D->isPrimitiveArray()) {
+        if (!this->emitGetPtrField(Field.Offset, SourceInfo{}))
+          return false;
+        if (!this->emitDestruction(D))
+          return false;
+        if (!this->emitPopPtr(SourceInfo{}))
+          return false;
+      }
+    }
+  }
+
+  for (const Record::Base &Base : llvm::reverse(R->bases())) {
+    if (!this->emitGetPtrBase(Base.Offset, SourceInfo{}))
+      return false;
+    if (!this->emitRecordDestruction(Base.R))
+      return false;
+    if (!this->emitPopPtr(SourceInfo{}))
+      return false;
+  }
+
+  // FIXME: Virtual bases are not destroyed yet.
+  return this->emitPopPtr(Dtor) && this->emitRetVoid(Dtor);
+}
+
 template <class Emitter>
 bool Compiler<Emitter>::visitFunc(const FunctionDecl *F) {
   // Classify the return type.
@@ -4863,6 +4679,8 @@ bool Compiler<Emitter>::visitFunc(const FunctionDecl *F) {
 
   if (const auto *Ctor = dyn_cast<CXXConstructorDecl>(F))
     return this->compileConstructor(Ctor);
+  if (const auto *Dtor = dyn_cast<CXXDestructorDecl>(F))
+    return this->compileDestructor(Dtor);
 
   // Emit custom code if this is a lambda static invoker.
   if (const auto *MD = dyn_cast<CXXMethodDecl>(F);
@@ -5099,113 +4917,6 @@ bool Compiler<Emitter>::VisitUnaryOperator(const UnaryOperator *E) {
   return false;
 }
 
-template <class Emitter>
-bool Compiler<Emitter>::VisitComplexUnaryOperator(const UnaryOperator *E) {
-  const Expr *SubExpr = E->getSubExpr();
-  assert(SubExpr->getType()->isAnyComplexType());
-
-  if (DiscardResult)
-    return this->discard(SubExpr);
-
-  std::optional<PrimType> ResT = classify(E);
-  auto prepareResult = [=]() -> bool {
-    if (!ResT && !Initializing) {
-      std::optional<unsigned> LocalIndex = allocateLocal(SubExpr);
-      if (!LocalIndex)
-        return false;
-      return this->emitGetPtrLocal(*LocalIndex, E);
-    }
-
-    return true;
-  };
-
-  // The offset of the temporary, if we created one.
-  unsigned SubExprOffset = ~0u;
-  auto createTemp = [=, &SubExprOffset]() -> bool {
-    SubExprOffset = this->allocateLocalPrimitive(SubExpr, PT_Ptr, true, false);
-    if (!this->visit(SubExpr))
-      return false;
-    return this->emitSetLocal(PT_Ptr, SubExprOffset, E);
-  };
-
-  PrimType ElemT = classifyComplexElementType(SubExpr->getType());
-  auto getElem = [=](unsigned Offset, unsigned Index) -> bool {
-    if (!this->emitGetLocal(PT_Ptr, Offset, E))
-      return false;
-    return this->emitArrayElemPop(ElemT, Index, E);
-  };
-
-  switch (E->getOpcode()) {
-  case UO_Minus:
-    if (!prepareResult())
-      return false;
-    if (!createTemp())
-      return false;
-    for (unsigned I = 0; I != 2; ++I) {
-      if (!getElem(SubExprOffset, I))
-        return false;
-      if (!this->emitNeg(ElemT, E))
-        return false;
-      if (!this->emitInitElem(ElemT, I, E))
-        return false;
-    }
-    break;
-
-  case UO_Plus:   // +x
-  case UO_AddrOf: // &x
-  case UO_Deref:  // *x
-    return this->delegate(SubExpr);
-
-  case UO_LNot:
-    if (!this->visit(SubExpr))
-      return false;
-    if (!this->emitComplexBoolCast(SubExpr))
-      return false;
-    if (!this->emitInvBool(E))
-      return false;
-    if (PrimType ET = classifyPrim(E->getType()); ET != PT_Bool)
-      return this->emitCast(PT_Bool, ET, E);
-    return true;
-
-  case UO_Real:
-    return this->emitComplexReal(SubExpr);
-
-  case UO_Imag:
-    if (!this->visit(SubExpr))
-      return false;
-
-    if (SubExpr->isLValue()) {
-      if (!this->emitConstUint8(1, E))
-        return false;
-      return this->emitArrayElemPtrPopUint8(E);
-    }
-
-    // Since our _Complex implementation does not map to a primitive type,
-    // we sometimes have to do the lvalue-to-rvalue conversion here manually.
-    return this->emitArrayElemPop(classifyPrim(E->getType()), 1, E);
-
-  case UO_Not: // ~x
-    if (!this->visit(SubExpr))
-      return false;
-    // Negate the imaginary component.
-    if (!this->emitArrayElem(ElemT, 1, E))
-      return false;
-    if (!this->emitNeg(ElemT, E))
-      return false;
-    if (!this->emitInitElem(ElemT, 1, E))
-      return false;
-    return DiscardResult ? this->emitPopPtr(E) : true;
-
-  case UO_Extension:
-    return this->delegate(SubExpr);
-
-  default:
-    return this->emitInvalid(E);
-  }
-
-  return true;
-}
-
 template <class Emitter>
 bool Compiler<Emitter>::visitDeclRef(const ValueDecl *D, const Expr *E) {
   if (DiscardResult)
@@ -5405,214 +5116,25 @@ bool Compiler<Emitter>::emitPrimCast(PrimType FromT, PrimType ToT,
   return false;
 }
 
-/// Emits __real(SubExpr)
-template <class Emitter>
-bool Compiler<Emitter>::emitComplexReal(const Expr *SubExpr) {
-  assert(SubExpr->getType()->isAnyComplexType());
-
-  if (DiscardResult)
-    return this->discard(SubExpr);
-
-  if (!this->visit(SubExpr))
-    return false;
-  if (SubExpr->isLValue()) {
-    if (!this->emitConstUint8(0, SubExpr))
-      return false;
-    return this->emitArrayElemPtrPopUint8(SubExpr);
-  }
-
-  // Rvalue, load the actual element.
-  return this->emitArrayElemPop(classifyComplexElementType(SubExpr->getType()),
-                                0, SubExpr);
-}
-
-template <class Emitter>
-bool Compiler<Emitter>::emitComplexBoolCast(const Expr *E) {
-  assert(!DiscardResult);
-  PrimType ElemT = classifyComplexElementType(E->getType());
-  // We emit the expression (__real(E) != 0 || __imag(E) != 0)
-  // for us, that means (bool)E[0] || (bool)E[1]
-  if (!this->emitArrayElem(ElemT, 0, E))
-    return false;
-  if (ElemT == PT_Float) {
-    if (!this->emitCastFloatingIntegral(PT_Bool, E))
-      return false;
-  } else {
-    if (!this->emitCast(ElemT, PT_Bool, E))
-      return false;
-  }
-
-  // We now have the bool value of E[0] on the stack.
-  LabelTy LabelTrue = this->getLabel();
-  if (!this->jumpTrue(LabelTrue))
-    return false;
-
-  if (!this->emitArrayElemPop(ElemT, 1, E))
-    return false;
-  if (ElemT == PT_Float) {
-    if (!this->emitCastFloatingIntegral(PT_Bool, E))
-      return false;
-  } else {
-    if (!this->emitCast(ElemT, PT_Bool, E))
-      return false;
-  }
-  // Leave the boolean value of E[1] on the stack.
-  LabelTy EndLabel = this->getLabel();
-  this->jump(EndLabel);
-
-  this->emitLabel(LabelTrue);
-  if (!this->emitPopPtr(E))
-    return false;
-  if (!this->emitConstBool(true, E))
-    return false;
-
-  this->fallthrough(EndLabel);
-  this->emitLabel(EndLabel);
-
-  return true;
-}
-
-template <class Emitter>
-bool Compiler<Emitter>::emitComplexComparison(const Expr *LHS, const Expr *RHS,
-                                              const BinaryOperator *E) {
-  assert(E->isComparisonOp());
-  assert(!Initializing);
-  assert(!DiscardResult);
-
-  PrimType ElemT;
-  bool LHSIsComplex;
-  unsigned LHSOffset;
-  if (LHS->getType()->isAnyComplexType()) {
-    LHSIsComplex = true;
-    ElemT = classifyComplexElementType(LHS->getType());
-    LHSOffset = allocateLocalPrimitive(LHS, PT_Ptr, /*IsConst=*/true,
-                                       /*IsExtended=*/false);
-    if (!this->visit(LHS))
-      return false;
-    if (!this->emitSetLocal(PT_Ptr, LHSOffset, E))
-      return false;
-  } else {
-    LHSIsComplex = false;
-    PrimType LHST = classifyPrim(LHS->getType());
-    LHSOffset = this->allocateLocalPrimitive(LHS, LHST, true, false);
-    if (!this->visit(LHS))
-      return false;
-    if (!this->emitSetLocal(LHST, LHSOffset, E))
-      return false;
-  }
-
-  bool RHSIsComplex;
-  unsigned RHSOffset;
-  if (RHS->getType()->isAnyComplexType()) {
-    RHSIsComplex = true;
-    ElemT = classifyComplexElementType(RHS->getType());
-    RHSOffset = allocateLocalPrimitive(RHS, PT_Ptr, /*IsConst=*/true,
-                                       /*IsExtended=*/false);
-    if (!this->visit(RHS))
-      return false;
-    if (!this->emitSetLocal(PT_Ptr, RHSOffset, E))
-      return false;
-  } else {
-    RHSIsComplex = false;
-    PrimType RHST = classifyPrim(RHS->getType());
-    RHSOffset = this->allocateLocalPrimitive(RHS, RHST, true, false);
-    if (!this->visit(RHS))
-      return false;
-    if (!this->emitSetLocal(RHST, RHSOffset, E))
-      return false;
-  }
-
-  auto getElem = [&](unsigned LocalOffset, unsigned Index,
-                     bool IsComplex) -> bool {
-    if (IsComplex) {
-      if (!this->emitGetLocal(PT_Ptr, LocalOffset, E))
-        return false;
-      return this->emitArrayElemPop(ElemT, Index, E);
-    }
-    return this->emitGetLocal(ElemT, LocalOffset, E);
-  };
-
-  for (unsigned I = 0; I != 2; ++I) {
-    // Get both values.
-    if (!getElem(LHSOffset, I, LHSIsComplex))
-      return false;
-    if (!getElem(RHSOffset, I, RHSIsComplex))
-      return false;
-    // And compare them.
-    if (!this->emitEQ(ElemT, E))
-      return false;
-
-    if (!this->emitCastBoolUint8(E))
-      return false;
-  }
-
-  // We now have two bool values on the stack. Compare those.
-  if (!this->emitAddUint8(E))
-    return false;
-  if (!this->emitConstUint8(2, E))
-    return false;
-
-  if (E->getOpcode() == BO_EQ) {
-    if (!this->emitEQUint8(E))
-      return false;
-  } else if (E->getOpcode() == BO_NE) {
-    if (!this->emitNEUint8(E))
-      return false;
-  } else
-    return false;
-
-  // In C, this returns an int.
-  if (PrimType ResT = classifyPrim(E->getType()); ResT != PT_Bool)
-    return this->emitCast(PT_Bool, ResT, E);
-  return true;
-}
-
 /// When calling this, we have a pointer of the local-to-destroy
 /// on the stack.
 /// Emit destruction of record types (or arrays of record types).
 template <class Emitter>
 bool Compiler<Emitter>::emitRecordDestruction(const Record *R) {
   assert(R);
-  if (!R->isUnion()) {
-    // First, destroy all fields.
-    for (const Record::Field &Field : llvm::reverse(R->fields())) {
-      const Descriptor *D = Field.Desc;
-      if (!D->isPrimitive() && !D->isPrimitiveArray()) {
-        if (!this->emitGetPtrField(Field.Offset, SourceInfo{}))
-          return false;
-        if (!this->emitDestruction(D))
-          return false;
-        if (!this->emitPopPtr(SourceInfo{}))
-          return false;
-      }
-    }
-  }
-
-  // Now emit the destructor and recurse into base classes.
-  if (const CXXDestructorDecl *Dtor = R->getDestructor();
-      Dtor && !Dtor->isTrivial()) {
-    const Function *DtorFunc = getFunction(Dtor);
-    if (!DtorFunc)
-      return false;
-    assert(DtorFunc->hasThisPointer());
-    assert(DtorFunc->getNumParams() == 1);
-    if (!this->emitDupPtr(SourceInfo{}))
-      return false;
-    if (!this->emitCall(DtorFunc, 0, SourceInfo{}))
-      return false;
-  }
-
-  for (const Record::Base &Base : llvm::reverse(R->bases())) {
-    if (!this->emitGetPtrBase(Base.Offset, SourceInfo{}))
-      return false;
-    if (!this->emitRecordDestruction(Base.R))
-      return false;
-    if (!this->emitPopPtr(SourceInfo{}))
-      return false;
-  }
+  const CXXDestructorDecl *Dtor = R->getDestructor();
+  if (!Dtor || Dtor->isTrivial())
+    return true;
 
-  // FIXME: Virtual bases.
-  return true;
+  assert(Dtor);
+  const Function *DtorFunc = getFunction(Dtor);
+  if (!DtorFunc)
+    return false;
+  assert(DtorFunc->hasThisPointer());
+  assert(DtorFunc->getNumParams() == 1);
+  if (!this->emitDupPtr(SourceInfo{}))
+    return false;
+  return this->emitCall(DtorFunc, 0, SourceInfo{});
 }
 /// When calling this, we have a pointer of the local-to-destroy
 /// on the stack.
diff --git a/clang/lib/AST/Interp/Compiler.h b/clang/lib/AST/Interp/Compiler.h
index d94d3613775a19..112219c49e8bdd 100644
--- a/clang/lib/AST/Interp/Compiler.h
+++ b/clang/lib/AST/Interp/Compiler.h
@@ -358,6 +358,7 @@ class Compiler : public ConstStmtVisitor<Compiler<Emitter>, bool>,
                              const QualType DerivedType);
   bool emitLambdaStaticInvokerBody(const CXXMethodDecl *MD);
   bool compileConstructor(const CXXConstructorDecl *Ctor);
+  bool compileDestructor(const CXXDestructorDecl *Dtor);
 
   bool checkLiteralType(const Expr *E);
 
diff --git a/clang/lib/AST/Interp/CompilerComplex.cpp b/clang/lib/AST/Interp/CompilerComplex.cpp
new file mode 100644
index 00000000000000..e22c72785373d1
--- /dev/null
+++ b/clang/lib/AST/Interp/CompilerComplex.cpp
@@ -0,0 +1,526 @@
+//===--- CompilerComplex.cpp ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ByteCodeEmitter.h"
+#include "Compiler.h"
+#include "Context.h"
+#include "Floating.h"
+#include "Function.h"
+#include "InterpShared.h"
+#include "PrimType.h"
+#include "Program.h"
+#include "clang/AST/Attr.h"
+
+using namespace clang;
+using namespace clang::interp;
+
+template <class Emitter>
+bool Compiler<Emitter>::VisitComplexBinOp(const BinaryOperator *E) {
+  // Prepare storage for result.
+  if (!Initializing) {
+    std::optional<unsigned> LocalIndex = allocateLocal(E);
+    if (!LocalIndex)
+      return false;
+    if (!this->emitGetPtrLocal(*LocalIndex, E))
+      return false;
+  }
+
+  // Both LHS and RHS might _not_ be of complex type, but one of them
+  // needs to be.
+  const Expr *LHS = E->getLHS();
+  const Expr *RHS = E->getRHS();
+
+  PrimType ResultElemT = this->classifyComplexElementType(E->getType());
+  unsigned ResultOffset = ~0u;
+  if (!DiscardResult)
+    ResultOffset = this->allocateLocalPrimitive(E, PT_Ptr, true, false);
+
+  // Save result pointer in ResultOffset
+  if (!this->DiscardResult) {
+    if (!this->emitDupPtr(E))
+      return false;
+    if (!this->emitSetLocal(PT_Ptr, ResultOffset, E))
+      return false;
+  }
+  QualType LHSType = LHS->getType();
+  if (const auto *AT = LHSType->getAs<AtomicType>())
+    LHSType = AT->getValueType();
+  QualType RHSType = RHS->getType();
+  if (const auto *AT = RHSType->getAs<AtomicType>())
+    RHSType = AT->getValueType();
+
+  bool LHSIsComplex = LHSType->isAnyComplexType();
+  unsigned LHSOffset;
+  bool RHSIsComplex = RHSType->isAnyComplexType();
+
+  // For complex-by-complex multiplication there is a dedicated op (Mulc),
+  // which keeps the implementation here simple.
+  BinaryOperatorKind Op = E->getOpcode();
+  if (Op == BO_Mul && LHSIsComplex && RHSIsComplex) {
+    assert(classifyPrim(LHSType->getAs<ComplexType>()->getElementType()) ==
+           classifyPrim(RHSType->getAs<ComplexType>()->getElementType()));
+    PrimType ElemT =
+        classifyPrim(LHSType->getAs<ComplexType>()->getElementType());
+    if (!this->visit(LHS))
+      return false;
+    if (!this->visit(RHS))
+      return false;
+    return this->emitMulc(ElemT, E);
+  }
+
+  if (Op == BO_Div && RHSIsComplex) {
+    QualType ElemQT = RHSType->getAs<ComplexType>()->getElementType();
+    PrimType ElemT = classifyPrim(ElemQT);
+    // If the LHS is not complex, we still need to do the full complex
+    // division, so create a temporary complex value and initialize it with
+    // the LHS as the real part and zero as the imaginary part.
+
+    if (!LHSIsComplex) {
+      // This is using the RHS type for the fake-complex LHS.
+      if (auto LHSO = allocateLocal(RHS))
+        LHSOffset = *LHSO;
+      else
+        return false;
+
+      if (!this->emitGetPtrLocal(LHSOffset, E))
+        return false;
+
+      if (!this->visit(LHS))
+        return false;
+      // real is LHS
+      if (!this->emitInitElem(ElemT, 0, E))
+        return false;
+      // imag is zero
+      if (!this->visitZeroInitializer(ElemT, ElemQT, E))
+        return false;
+      if (!this->emitInitElem(ElemT, 1, E))
+        return false;
+    } else {
+      if (!this->visit(LHS))
+        return false;
+    }
+
+    if (!this->visit(RHS))
+      return false;
+    return this->emitDivc(ElemT, E);
+  }
+
+  // Evaluate LHS and save value to LHSOffset.
+  if (LHSType->isAnyComplexType()) {
+    LHSOffset = this->allocateLocalPrimitive(LHS, PT_Ptr, true, false);
+    if (!this->visit(LHS))
+      return false;
+    if (!this->emitSetLocal(PT_Ptr, LHSOffset, E))
+      return false;
+  } else {
+    PrimType LHST = classifyPrim(LHSType);
+    LHSOffset = this->allocateLocalPrimitive(LHS, LHST, true, false);
+    if (!this->visit(LHS))
+      return false;
+    if (!this->emitSetLocal(LHST, LHSOffset, E))
+      return false;
+  }
+
+  // Same with RHS.
+  unsigned RHSOffset;
+  if (RHSType->isAnyComplexType()) {
+    RHSOffset = this->allocateLocalPrimitive(RHS, PT_Ptr, true, false);
+    if (!this->visit(RHS))
+      return false;
+    if (!this->emitSetLocal(PT_Ptr, RHSOffset, E))
+      return false;
+  } else {
+    PrimType RHST = classifyPrim(RHSType);
+    RHSOffset = this->allocateLocalPrimitive(RHS, RHST, true, false);
+    if (!this->visit(RHS))
+      return false;
+    if (!this->emitSetLocal(RHST, RHSOffset, E))
+      return false;
+  }
+
+  // For both LHS and RHS, either load the element from the complex pointer, or
+  // the scalar directly from its local variable. If LoadZero is set, a scalar
+  // operand contributes 0 for index 1 (i.e. the imaginary part) instead.
+  auto loadComplexValue = [this](bool IsComplex, bool LoadZero,
+                                 unsigned ElemIndex, unsigned Offset,
+                                 const Expr *E) -> bool {
+    if (IsComplex) {
+      if (!this->emitGetLocal(PT_Ptr, Offset, E))
+        return false;
+      return this->emitArrayElemPop(classifyComplexElementType(E->getType()),
+                                    ElemIndex, E);
+    }
+    if (ElemIndex == 0 || !LoadZero)
+      return this->emitGetLocal(classifyPrim(E->getType()), Offset, E);
+    return this->visitZeroInitializer(classifyPrim(E->getType()), E->getType(),
+                                      E);
+  };
+
+  // Compute the result element-wise: index 0 is real, index 1 is imaginary.
+  for (unsigned ElemIndex = 0; ElemIndex != 2; ++ElemIndex) {
+    // Result pointer for the store later.
+    if (!this->DiscardResult) {
+      if (!this->emitGetLocal(PT_Ptr, ResultOffset, E))
+        return false;
+    }
+
+    // The actual operation.
+    switch (Op) {
+    case BO_Add:
+      if (!loadComplexValue(LHSIsComplex, true, ElemIndex, LHSOffset, LHS))
+        return false;
+
+      if (!loadComplexValue(RHSIsComplex, true, ElemIndex, RHSOffset, RHS))
+        return false;
+      if (ResultElemT == PT_Float) {
+        if (!this->emitAddf(getRoundingMode(E), E))
+          return false;
+      } else {
+        if (!this->emitAdd(ResultElemT, E))
+          return false;
+      }
+      break;
+    case BO_Sub:
+      if (!loadComplexValue(LHSIsComplex, true, ElemIndex, LHSOffset, LHS))
+        return false;
+
+      if (!loadComplexValue(RHSIsComplex, true, ElemIndex, RHSOffset, RHS))
+        return false;
+      if (ResultElemT == PT_Float) {
+        if (!this->emitSubf(getRoundingMode(E), E))
+          return false;
+      } else {
+        if (!this->emitSub(ResultElemT, E))
+          return false;
+      }
+      break;
+    case BO_Mul:
+      if (!loadComplexValue(LHSIsComplex, false, ElemIndex, LHSOffset, LHS))
+        return false;
+
+      if (!loadComplexValue(RHSIsComplex, false, ElemIndex, RHSOffset, RHS))
+        return false;
+
+      if (ResultElemT == PT_Float) {
+        if (!this->emitMulf(getRoundingMode(E), E))
+          return false;
+      } else {
+        if (!this->emitMul(ResultElemT, E))
+          return false;
+      }
+      break;
+    case BO_Div:
+      assert(!RHSIsComplex);
+      if (!loadComplexValue(LHSIsComplex, false, ElemIndex, LHSOffset, LHS))
+        return false;
+
+      if (!loadComplexValue(RHSIsComplex, false, ElemIndex, RHSOffset, RHS))
+        return false;
+
+      if (ResultElemT == PT_Float) {
+        if (!this->emitDivf(getRoundingMode(E), E))
+          return false;
+      } else {
+        if (!this->emitDiv(ResultElemT, E))
+          return false;
+      }
+      break;
+
+    default:
+      return false;
+    }
+
+    if (!this->DiscardResult) {
+      // Initialize array element with the value we just computed.
+      if (!this->emitInitElemPop(ResultElemT, ElemIndex, E))
+        return false;
+    } else {
+      if (!this->emitPop(ResultElemT, E))
+        return false;
+    }
+  }
+  return true;
+}
+
+template <class Emitter>
+bool Compiler<Emitter>::emitComplexComparison(const Expr *LHS, const Expr *RHS,
+                                              const BinaryOperator *E) {
+  assert(E->isComparisonOp());
+  assert(!Initializing);
+  assert(!DiscardResult);
+
+  PrimType ElemT;
+  bool LHSIsComplex;
+  unsigned LHSOffset;
+  if (LHS->getType()->isAnyComplexType()) {
+    LHSIsComplex = true;
+    ElemT = classifyComplexElementType(LHS->getType());
+    LHSOffset = allocateLocalPrimitive(LHS, PT_Ptr, /*IsConst=*/true,
+                                       /*IsExtended=*/false);
+    if (!this->visit(LHS))
+      return false;
+    if (!this->emitSetLocal(PT_Ptr, LHSOffset, E))
+      return false;
+  } else {
+    LHSIsComplex = false;
+    PrimType LHST = classifyPrim(LHS->getType());
+    LHSOffset = this->allocateLocalPrimitive(LHS, LHST, true, false);
+    if (!this->visit(LHS))
+      return false;
+    if (!this->emitSetLocal(LHST, LHSOffset, E))
+      return false;
+  }
+
+  bool RHSIsComplex;
+  unsigned RHSOffset;
+  if (RHS->getType()->isAnyComplexType()) {
+    RHSIsComplex = true;
+    ElemT = classifyComplexElementType(RHS->getType());
+    RHSOffset = allocateLocalPrimitive(RHS, PT_Ptr, /*IsConst=*/true,
+                                       /*IsExtended=*/false);
+    if (!this->visit(RHS))
+      return false;
+    if (!this->emitSetLocal(PT_Ptr, RHSOffset, E))
+      return false;
+  } else {
+    RHSIsComplex = false;
+    PrimType RHST = classifyPrim(RHS->getType());
+    RHSOffset = this->allocateLocalPrimitive(RHS, RHST, true, false);
+    if (!this->visit(RHS))
+      return false;
+    if (!this->emitSetLocal(RHST, RHSOffset, E))
+      return false;
+  }
+
+  auto getElem = [&](unsigned LocalOffset, unsigned Index,
+                     bool IsComplex) -> bool {
+    if (IsComplex) {
+      if (!this->emitGetLocal(PT_Ptr, LocalOffset, E))
+        return false;
+      return this->emitArrayElemPop(ElemT, Index, E);
+    }
+    return this->emitGetLocal(ElemT, LocalOffset, E);
+  };
+
+  for (unsigned I = 0; I != 2; ++I) {
+    // Get both values. NOTE(review): a scalar operand is reloaded for I == 1.
+    if (!getElem(LHSOffset, I, LHSIsComplex))
+      return false;
+    if (!getElem(RHSOffset, I, RHSIsComplex))
+      return false;
+    // Compare them for equality.
+    if (!this->emitEQ(ElemT, E))
+      return false;
+
+    if (!this->emitCastBoolUint8(E))
+      return false;
+  }
+
+  // Two Uint8 flags are now on the stack; their sum is 2 iff both matched.
+  if (!this->emitAddUint8(E))
+    return false;
+  if (!this->emitConstUint8(2, E))
+    return false;
+
+  if (E->getOpcode() == BO_EQ) {
+    if (!this->emitEQUint8(E))
+      return false;
+  } else if (E->getOpcode() == BO_NE) {
+    if (!this->emitNEUint8(E))
+      return false;
+  } else
+    return false;
+
+  // In C the result type is int, so cast the bool if necessary.
+  if (PrimType ResT = classifyPrim(E->getType()); ResT != PT_Bool)
+    return this->emitCast(PT_Bool, ResT, E);
+  return true;
+}
+
+/// Emits __real(SubExpr); with DiscardResult set, SubExpr is only discarded.
+template <class Emitter>
+bool Compiler<Emitter>::emitComplexReal(const Expr *SubExpr) {
+  assert(SubExpr->getType()->isAnyComplexType());
+
+  if (DiscardResult)
+    return this->discard(SubExpr);
+
+  if (!this->visit(SubExpr))
+    return false;
+  if (SubExpr->isLValue()) {
+    if (!this->emitConstUint8(0, SubExpr))
+      return false;
+    return this->emitArrayElemPtrPopUint8(SubExpr);
+  }
+
+  // Rvalue: load element 0 (the real part) directly.
+  return this->emitArrayElemPop(classifyComplexElementType(SubExpr->getType()),
+                                0, SubExpr);
+}
+
+template <class Emitter>
+bool Compiler<Emitter>::emitComplexBoolCast(const Expr *E) {
+  assert(!DiscardResult);
+  PrimType ElemT = classifyComplexElementType(E->getType());
+  // Emits (__real(E) != 0 || __imag(E) != 0), i.e. (bool)E[0] || (bool)E[1].
+  if (!this->emitArrayElem(ElemT, 0, E))
+    return false;
+  if (ElemT == PT_Float) {
+    if (!this->emitCastFloatingIntegral(PT_Bool, E))
+      return false;
+  } else {
+    if (!this->emitCast(ElemT, PT_Bool, E))
+      return false;
+  }
+
+  // We now have the bool value of E[0] on the stack.
+  LabelTy LabelTrue = this->getLabel();
+  if (!this->jumpTrue(LabelTrue))
+    return false;
+
+  if (!this->emitArrayElemPop(ElemT, 1, E))
+    return false;
+  if (ElemT == PT_Float) {
+    if (!this->emitCastFloatingIntegral(PT_Bool, E))
+      return false;
+  } else {
+    if (!this->emitCast(ElemT, PT_Bool, E))
+      return false;
+  }
+  // Leave the boolean value of E[1] on the stack.
+  LabelTy EndLabel = this->getLabel();
+  if (!this->jump(EndLabel))
+    return false;
+
+  this->emitLabel(LabelTrue);
+  if (!this->emitPopPtr(E))
+    return false;
+  if (!this->emitConstBool(true, E))
+    return false;
+
+  this->fallthrough(EndLabel);
+  this->emitLabel(EndLabel);
+
+  return true;
+}
+
+template <class Emitter>
+bool Compiler<Emitter>::VisitComplexUnaryOperator(const UnaryOperator *E) {
+  const Expr *SubExpr = E->getSubExpr();
+  assert(SubExpr->getType()->isAnyComplexType());
+
+  if (DiscardResult)
+    return this->discard(SubExpr);
+
+  std::optional<PrimType> ResT = classify(E);
+  auto prepareResult = [=]() -> bool {
+    if (!ResT && !Initializing) {
+      std::optional<unsigned> LocalIndex = allocateLocal(SubExpr);
+      if (!LocalIndex)
+        return false;
+      return this->emitGetPtrLocal(*LocalIndex, E);
+    }
+
+    return true;
+  };
+
+  // The offset of the temporary, if we created one.
+  unsigned SubExprOffset = ~0u;
+  auto createTemp = [=, &SubExprOffset]() -> bool {
+    SubExprOffset = this->allocateLocalPrimitive(SubExpr, PT_Ptr, true, false);
+    if (!this->visit(SubExpr))
+      return false;
+    return this->emitSetLocal(PT_Ptr, SubExprOffset, E);
+  };
+
+  PrimType ElemT = classifyComplexElementType(SubExpr->getType());
+  auto getElem = [=](unsigned Offset, unsigned Index) -> bool {
+    if (!this->emitGetLocal(PT_Ptr, Offset, E))
+      return false;
+    return this->emitArrayElemPop(ElemT, Index, E);
+  };
+
+  switch (E->getOpcode()) {
+  case UO_Minus:
+    if (!prepareResult())
+      return false;
+    if (!createTemp())
+      return false;
+    for (unsigned I = 0; I != 2; ++I) {
+      if (!getElem(SubExprOffset, I))
+        return false;
+      if (!this->emitNeg(ElemT, E))
+        return false;
+      if (!this->emitInitElem(ElemT, I, E))
+        return false;
+    }
+    break;
+
+  case UO_Plus:   // +x
+  case UO_AddrOf: // &x
+  case UO_Deref:  // *x
+    return this->delegate(SubExpr);
+
+  case UO_LNot:
+    if (!this->visit(SubExpr))
+      return false;
+    if (!this->emitComplexBoolCast(SubExpr))
+      return false;
+    if (!this->emitInvBool(E))
+      return false;
+    if (PrimType ET = classifyPrim(E->getType()); ET != PT_Bool)
+      return this->emitCast(PT_Bool, ET, E);
+    return true;
+
+  case UO_Real:
+    return this->emitComplexReal(SubExpr);
+
+  case UO_Imag:
+    if (!this->visit(SubExpr))
+      return false;
+
+    if (SubExpr->isLValue()) {
+      if (!this->emitConstUint8(1, E))
+        return false;
+      return this->emitArrayElemPtrPopUint8(E);
+    }
+
+    // Since our _Complex implementation does not map to a primitive type,
+    // we sometimes have to do the lvalue-to-rvalue conversion here manually.
+    return this->emitArrayElemPop(classifyPrim(E->getType()), 1, E);
+
+  case UO_Not: // ~x
+    if (!this->visit(SubExpr))
+      return false;
+    // Negate the imaginary component.
+    if (!this->emitArrayElem(ElemT, 1, E))
+      return false;
+    if (!this->emitNeg(ElemT, E))
+      return false;
+    if (!this->emitInitElem(ElemT, 1, E))
+      return false;
+    return DiscardResult ? this->emitPopPtr(E) : true;
+
+  case UO_Extension:
+    return this->delegate(SubExpr);
+
+  default:
+    return this->emitInvalid(E);
+  }
+
+  return true;
+}
+
+namespace clang {
+namespace interp {
+
+template class Compiler<ByteCodeEmitter>;
+template class Compiler<EvalEmitter>;
+
+} // namespace interp
+} // namespace clang
diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp
index 4a50b4487b6654..94d613352ba227 100644
--- a/clang/lib/AST/Interp/Interp.cpp
+++ b/clang/lib/AST/Interp/Interp.cpp
@@ -871,23 +871,6 @@ static bool runRecordDestructor(InterpState &S, CodePtr OpPC,
     return false;
   }
 
-  // Fields.
-  for (const Record::Field &Field : llvm::reverse(R->fields())) {
-    const Descriptor *D = Field.Desc;
-    if (D->isRecord()) {
-      if (!runRecordDestructor(S, OpPC, BasePtr.atField(Field.Offset), D))
-        return false;
-    } else if (D->isCompositeArray()) {
-      const Descriptor *ElemDesc = Desc->ElemDesc;
-      assert(ElemDesc->isRecord());
-      for (unsigned I = 0; I != Desc->getNumElems(); ++I) {
-        if (!runRecordDestructor(S, OpPC, BasePtr.atIndex(I).narrow(),
-                                 ElemDesc))
-          return false;
-      }
-    }
-  }
-
   // Destructor of this record.
   if (const CXXDestructorDecl *Dtor = R->getDestructor();
       Dtor && !Dtor->isTrivial()) {
@@ -899,13 +882,6 @@ static bool runRecordDestructor(InterpState &S, CodePtr OpPC,
     if (!Call(S, OpPC, DtorFunc, 0))
       return false;
   }
-
-  // Bases.
-  for (const Record::Base &Base : llvm::reverse(R->bases())) {
-    if (!runRecordDestructor(S, OpPC, BasePtr.atField(Base.Offset), Base.Desc))
-      return false;
-  }
-
   return true;
 }
 
diff --git a/clang/lib/AST/Interp/InterpFrame.cpp b/clang/lib/AST/Interp/InterpFrame.cpp
index 27108f957305f3..4530f28d694e84 100644
--- a/clang/lib/AST/Interp/InterpFrame.cpp
+++ b/clang/lib/AST/Interp/InterpFrame.cpp
@@ -102,8 +102,9 @@ void InterpFrame::popArgs() {
 }
 
 template <typename T>
-static void print(llvm::raw_ostream &OS, const T &V, ASTContext &, QualType) {
-  OS << V;
+static void print(llvm::raw_ostream &OS, const T &V, ASTContext &ASTCtx,
+                  QualType Ty) {
+  V.toAPValue(ASTCtx).printPretty(OS, ASTCtx, Ty);
 }
 
 template <>
diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index 388c927c9aa558..d50d4c7028c697 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -2892,3 +2892,7 @@ void TextNodeDumper::VisitEmbedExpr(const EmbedExpr *S) {
   AddChild("begin", [=] { OS << S->getStartingElementPos(); });
   AddChild("number of elements", [=] { OS << S->getDataElementCount(); });
 }
+
+void TextNodeDumper::VisitAtomicExpr(const AtomicExpr *AE) {
+  OS << ' ' << AE->getOpAsString();
+}
diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp
index 1f5fb630a47038..59c5927717933d 100644
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@@ -15,10 +15,12 @@
 #include "CGCXXABI.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
+#include "clang/AST/CharUnits.h"
 #include "clang/AST/Decl.h"
 #include "clang/Basic/Cuda.h"
 #include "clang/CodeGen/CodeGenABITypes.h"
 #include "clang/CodeGen/ConstantInitBuilder.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Frontend/Offloading/Utility.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
@@ -36,6 +38,11 @@ constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
 
 class CGNVCUDARuntime : public CGCUDARuntime {
 
+  /// The prefix used for function calls and section names (CUDA, HIP, LLVM)
+  StringRef Prefix;
+  /// TODO: We should transition the OpenMP section to LLVM/Offload
+  StringRef SectionPrefix;
+
 private:
   llvm::IntegerType *IntTy, *SizeTy;
   llvm::Type *VoidTy;
@@ -132,6 +139,9 @@ class CGNVCUDARuntime : public CGCUDARuntime {
     return DummyFunc;
   }
 
+  Address prepareKernelArgs(CodeGenFunction &CGF, FunctionArgList &Args);
+  Address prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
+                                       FunctionArgList &Args);
   void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
   void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
   std::string getDeviceSideName(const NamedDecl *ND) override;
@@ -191,15 +201,11 @@ class CGNVCUDARuntime : public CGCUDARuntime {
 } // end anonymous namespace
 
 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
-  if (CGM.getLangOpts().HIP)
-    return ((Twine("hip") + Twine(FuncName)).str());
-  return ((Twine("cuda") + Twine(FuncName)).str());
+  return (Prefix + FuncName).str();
 }
 std::string
 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
-  if (CGM.getLangOpts().HIP)
-    return ((Twine("__hip") + Twine(FuncName)).str());
-  return ((Twine("__cuda") + Twine(FuncName)).str());
+  return ("__" + Prefix + FuncName).str();
 }
 
 static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
@@ -227,6 +233,14 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
   SizeTy = CGM.SizeTy;
   VoidTy = CGM.VoidTy;
   PtrTy = CGM.UnqualPtrTy;
+
+  if (CGM.getLangOpts().OffloadViaLLVM) {
+    Prefix = "llvm";
+    SectionPrefix = "omp";
+  } else if (CGM.getLangOpts().HIP)
+    SectionPrefix = Prefix = "hip";
+  else
+    SectionPrefix = Prefix = "cuda";
 }
 
 llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
@@ -305,18 +319,58 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
   }
   if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                          CudaFeature::CUDA_USES_NEW_LAUNCH) ||
-      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
+      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI) ||
+      (CGF.getLangOpts().OffloadViaLLVM))
     emitDeviceStubBodyNew(CGF, Args);
   else
     emitDeviceStubBodyLegacy(CGF, Args);
 }
 
-// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
-// array and kernels are launched using cudaLaunchKernel().
-void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
-                                            FunctionArgList &Args) {
-  // Build the shadow stack entry at the very start of the function.
+/// CUDA passes the arguments with a level of indirection. For example, a
+/// (void*, short, void*) is passed as {void **, short *, void **} to the launch
+/// function. For the LLVM/offload launch we flatten the arguments into the
+/// struct directly. In addition, we include the size of the arguments, thus
+/// pass {sizeof({void *, short, void *}), ptr to {void *, short, void *},
+/// nullptr}. The last nullptr needs to be initialized to an array of pointers
+/// pointing to the arguments if we want to offload to the host.
+Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
+                                                      FunctionArgList &Args) {
+  SmallVector<llvm::Type *> ArgTypes, KernelLaunchParamsTypes;
+  for (auto &Arg : Args)
+    ArgTypes.push_back(CGF.ConvertTypeForMem(Arg->getType()));
+  llvm::StructType *KernelArgsTy = llvm::StructType::create(ArgTypes);
+
+  auto *Int64Ty = CGF.Builder.getInt64Ty();
+  KernelLaunchParamsTypes.push_back(Int64Ty);
+  KernelLaunchParamsTypes.push_back(PtrTy);
+  KernelLaunchParamsTypes.push_back(PtrTy);
+
+  llvm::StructType *KernelLaunchParamsTy =
+      llvm::StructType::create(KernelLaunchParamsTypes);
+  Address KernelArgs = CGF.CreateTempAllocaWithoutCast(
+      KernelArgsTy, CharUnits::fromQuantity(16), "kernel_args");
+  Address KernelLaunchParams = CGF.CreateTempAllocaWithoutCast(
+      KernelLaunchParamsTy, CharUnits::fromQuantity(16),
+      "kernel_launch_params");
+
+  auto KernelArgsSize = CGM.getDataLayout().getTypeAllocSize(KernelArgsTy);
+  CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSize),
+                          CGF.Builder.CreateStructGEP(KernelLaunchParams, 0));
+  CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF),
+                          CGF.Builder.CreateStructGEP(KernelLaunchParams, 1));
+  CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy),
+                          CGF.Builder.CreateStructGEP(KernelLaunchParams, 2));
+
+  for (unsigned i = 0; i < Args.size(); ++i) {
+    auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i]));
+    CGF.Builder.CreateStore(ArgVal, CGF.Builder.CreateStructGEP(KernelArgs, i));
+  }
 
+  return KernelLaunchParams;
+}
+
+Address CGNVCUDARuntime::prepareKernelArgs(CodeGenFunction &CGF,
+                                           FunctionArgList &Args) {
   // Calculate amount of space we will need for all arguments.  If we have no
   // args, allocate a single pointer so we still have a valid pointer to the
   // argument array that we can pass to runtime, even if it will be unused.
@@ -331,6 +385,17 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
         VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
                         PtrTy, KernelArgs.emitRawPointer(CGF), i));
   }
+  return KernelArgs;
+}
+
+// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
+// array and kernels are launched using cudaLaunchKernel().
+void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
+                                            FunctionArgList &Args) {
+  // Build the shadow stack entry at the very start of the function.
+  Address KernelArgs = CGF.getLangOpts().OffloadViaLLVM
+                           ? prepareKernelArgsLLVMOffload(CGF, Args)
+                           : prepareKernelArgs(CGF, Args);
 
   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
 
@@ -1129,8 +1194,9 @@ void CGNVCUDARuntime::transformManagedVars() {
 // registered. The linker will provide a pointer to this section so we can
 // register the symbols with the linked device image.
 void CGNVCUDARuntime::createOffloadingEntries() {
-  StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
-                                            : "cuda_offloading_entries";
+  SmallVector<char, 32> Out;
+  StringRef Section = (SectionPrefix + "_offloading_entries").toStringRef(Out);
+
   llvm::Module &M = CGM.getModule();
   for (KernelInfo &I : EmittedKernels)
     llvm::offloading::emitOffloadingEntry(
@@ -1199,7 +1265,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
     }
     return nullptr;
   }
-  if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
+  if (CGM.getLangOpts().OffloadViaLLVM ||
+      (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
     createOffloadingEntries();
   else
     return makeModuleCtorFunction();
diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp
index ca5804018227ea..adc7cdbfded880 100644
--- a/clang/lib/CodeGen/CGObjCGNU.cpp
+++ b/clang/lib/CodeGen/CGObjCGNU.cpp
@@ -278,9 +278,9 @@ class CGObjCGNU : public CGObjCRuntime {
       Fields.addInt(IntTy, count);
       // int size; (only in GNUstep v2 ABI.
       if (isRuntime(ObjCRuntime::GNUstep, 2)) {
-        llvm::DataLayout td(&TheModule);
-        Fields.addInt(IntTy, td.getTypeSizeInBits(PropertyMetadataTy) /
-            CGM.getContext().getCharWidth());
+        const llvm::DataLayout &DL = TheModule.getDataLayout();
+        Fields.addInt(IntTy, DL.getTypeSizeInBits(PropertyMetadataTy) /
+                                 CGM.getContext().getCharWidth());
       }
       // struct objc_property_list *next;
       Fields.add(NULLPtr);
@@ -1190,9 +1190,9 @@ class CGObjCGNUstep2 : public CGObjCGNUstep {
     // int count;
     MethodList.addInt(IntTy, Methods.size());
     // int size; // sizeof(struct objc_method_description)
-    llvm::DataLayout td(&TheModule);
-    MethodList.addInt(IntTy, td.getTypeSizeInBits(ObjCMethodDescTy) /
-        CGM.getContext().getCharWidth());
+    const llvm::DataLayout &DL = TheModule.getDataLayout();
+    MethodList.addInt(IntTy, DL.getTypeSizeInBits(ObjCMethodDescTy) /
+                                 CGM.getContext().getCharWidth());
     // struct objc_method_description[]
     auto MethodArray = MethodList.beginArray(ObjCMethodDescTy);
     for (auto *M : Methods) {
@@ -1828,7 +1828,7 @@ class CGObjCGNUstep2 : public CGObjCGNUstep {
       int ivar_count = 0;
       for (const ObjCIvarDecl *IVD = classDecl->all_declared_ivar_begin(); IVD;
            IVD = IVD->getNextIvar()) ivar_count++;
-      llvm::DataLayout td(&TheModule);
+      const llvm::DataLayout &DL = TheModule.getDataLayout();
       // struct objc_ivar_list *ivars;
       ConstantInitBuilder b(CGM);
       auto ivarListBuilder = b.beginStruct();
@@ -1841,8 +1841,8 @@ class CGObjCGNUstep2 : public CGObjCGNUstep {
         PtrToInt8Ty,
         Int32Ty,
         Int32Ty);
-      ivarListBuilder.addInt(SizeTy, td.getTypeSizeInBits(ObjCIvarTy) /
-          CGM.getContext().getCharWidth());
+      ivarListBuilder.addInt(SizeTy, DL.getTypeSizeInBits(ObjCIvarTy) /
+                                         CGM.getContext().getCharWidth());
       // struct objc_ivar ivars[]
       auto ivarArrayBuilder = ivarListBuilder.beginArray();
       for (const ObjCIvarDecl *IVD = classDecl->all_declared_ivar_begin(); IVD;
@@ -3019,9 +3019,9 @@ GenerateMethodList(StringRef ClassName,
   bool isV2ABI = isRuntime(ObjCRuntime::GNUstep, 2);
   if (isV2ABI) {
     // size_t size;
-    llvm::DataLayout td(&TheModule);
-    MethodList.addInt(SizeTy, td.getTypeSizeInBits(ObjCMethodTy) /
-        CGM.getContext().getCharWidth());
+    const llvm::DataLayout &DL = TheModule.getDataLayout();
+    MethodList.addInt(SizeTy, DL.getTypeSizeInBits(ObjCMethodTy) /
+                                  CGM.getContext().getCharWidth());
     ObjCMethodTy =
       llvm::StructType::get(CGM.getLLVMContext(), {
         IMPTy,       // Method pointer
@@ -3161,10 +3161,9 @@ llvm::Constant *CGObjCGNU::GenerateClassStructure(
   Elements.addInt(LongTy, info);
   // instance_size
   if (isMeta) {
-    llvm::DataLayout td(&TheModule);
-    Elements.addInt(LongTy,
-                    td.getTypeSizeInBits(ClassTy) /
-                      CGM.getContext().getCharWidth());
+    const llvm::DataLayout &DL = TheModule.getDataLayout();
+    Elements.addInt(LongTy, DL.getTypeSizeInBits(ClassTy) /
+                                CGM.getContext().getCharWidth());
   } else
     Elements.add(InstanceSize);
   // ivars
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index 2b2e23f1e5d7fb..7adc5a48b7c47b 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -1013,7 +1013,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
   }
 
   if (FD && (getLangOpts().OpenCL ||
-             (getLangOpts().HIP && getLangOpts().CUDAIsDevice))) {
+             ((getLangOpts().HIP || getLangOpts().OffloadViaLLVM) &&
+              getLangOpts().CUDAIsDevice))) {
     // Add metadata for a kernel function.
     EmitKernelMetadata(FD, Fn);
   }
diff --git a/clang/lib/CodeGen/CodeGenTBAA.cpp b/clang/lib/CodeGen/CodeGenTBAA.cpp
index b66b6234c9d3d4..b7e6a4d1adcc37 100644
--- a/clang/lib/CodeGen/CodeGenTBAA.cpp
+++ b/clang/lib/CodeGen/CodeGenTBAA.cpp
@@ -318,7 +318,7 @@ TBAAAccessInfo CodeGenTBAA::getAccessInfo(QualType AccessType) {
 }
 
 TBAAAccessInfo CodeGenTBAA::getVTablePtrAccessInfo(llvm::Type *VTablePtrType) {
-  llvm::DataLayout DL(&Module);
+  const llvm::DataLayout &DL = Module.getDataLayout();
   unsigned Size = DL.getPointerTypeSize(VTablePtrType);
   return TBAAAccessInfo(createScalarTypeNode("vtable pointer", getRoot(), Size),
                         Size);
diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp
index 26ff4e4ac0a3b5..f71872e77fe823 100644
--- a/clang/lib/CodeGen/Targets/X86.cpp
+++ b/clang/lib/CodeGen/Targets/X86.cpp
@@ -3124,26 +3124,63 @@ RValue X86_64ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
     CGF.Builder.CreateStore(V, CGF.Builder.CreateStructGEP(Tmp, 1));
 
     RegAddr = Tmp.withElementType(LTy);
-  } else if (neededInt) {
-    RegAddr = Address(CGF.Builder.CreateGEP(CGF.Int8Ty, RegSaveArea, gp_offset),
-                      LTy, CharUnits::fromQuantity(8));
-
+  } else if (neededInt || neededSSE == 1) {
     // Copy to a temporary if necessary to ensure the appropriate alignment.
     auto TInfo = getContext().getTypeInfoInChars(Ty);
     uint64_t TySize = TInfo.Width.getQuantity();
     CharUnits TyAlign = TInfo.Align;
-
-    // Copy into a temporary if the type is more aligned than the
-    // register save area.
-    if (TyAlign.getQuantity() > 8) {
+    llvm::Type *CoTy = nullptr;
+    if (AI.isDirect())
+      CoTy = AI.getCoerceToType();
+
+    llvm::Value *GpOrFpOffset = neededInt ? gp_offset : fp_offset;
+    uint64_t Alignment = neededInt ? 8 : 16;
+    uint64_t RegSize = neededInt ? neededInt * 8 : 16;
+    // There are two cases that require special handling:
+    // 1)
+    //    ```
+    //    struct {
+    //      struct {} a[8];
+    //      int b;
+    //    };
+    //    ```
+    //    The lower 8 bytes of the structure are not stored,
+    //    so an 8-byte offset is needed when accessing the structure.
+    // 2)
+    //   ```
+    //   struct {
+    //     long long a;
+    //     struct {} b;
+    //   };
+    //   ```
+    //   The stored size of this structure is smaller than its actual size,
+    //   which may lead to reading past the end of the register save area.
+    if (CoTy && (AI.getDirectOffset() == 8 || RegSize < TySize)) {
       Address Tmp = CGF.CreateMemTemp(Ty);
-      CGF.Builder.CreateMemCpy(Tmp, RegAddr, TySize, false);
-      RegAddr = Tmp;
+      llvm::Value *Addr =
+          CGF.Builder.CreateGEP(CGF.Int8Ty, RegSaveArea, GpOrFpOffset);
+      llvm::Value *Src = CGF.Builder.CreateAlignedLoad(CoTy, Addr, TyAlign);
+      llvm::Value *PtrOffset =
+          llvm::ConstantInt::get(CGF.Int32Ty, AI.getDirectOffset());
+      Address Dst = Address(
+          CGF.Builder.CreateGEP(CGF.Int8Ty, Tmp.getBasePointer(), PtrOffset),
+          LTy, TyAlign);
+      CGF.Builder.CreateStore(Src, Dst);
+      RegAddr = Tmp.withElementType(LTy);
+    } else {
+      RegAddr =
+          Address(CGF.Builder.CreateGEP(CGF.Int8Ty, RegSaveArea, GpOrFpOffset),
+                  LTy, CharUnits::fromQuantity(Alignment));
+
+      // Copy into a temporary if the type is more aligned than the
+      // register save area.
+      if (neededInt && TyAlign.getQuantity() > 8) {
+        Address Tmp = CGF.CreateMemTemp(Ty);
+        CGF.Builder.CreateMemCpy(Tmp, RegAddr, TySize, false);
+        RegAddr = Tmp;
+      }
     }
 
-  } else if (neededSSE == 1) {
-    RegAddr = Address(CGF.Builder.CreateGEP(CGF.Int8Ty, RegSaveArea, fp_offset),
-                      LTy, CharUnits::fromQuantity(16));
   } else {
     assert(neededSSE == 2 && "Invalid number of needed registers!");
     // SSE registers are spaced 16 bytes apart in the register save
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index f4e909b79389bc..e12416e51f8d24 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -786,11 +786,13 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
                    }) ||
       C.getInputArgs().hasArg(options::OPT_hip_link) ||
       C.getInputArgs().hasArg(options::OPT_hipstdpar);
+  bool UseLLVMOffload = C.getInputArgs().hasFlag(
+      options::OPT_foffload_via_llvm, options::OPT_fno_offload_via_llvm, false);
   if (IsCuda && IsHIP) {
     Diag(clang::diag::err_drv_mix_cuda_hip);
     return;
   }
-  if (IsCuda) {
+  if (IsCuda && !UseLLVMOffload) {
     const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
     const llvm::Triple &HostTriple = HostTC->getTriple();
     auto OFK = Action::OFK_Cuda;
@@ -812,7 +814,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
         CudaInstallation.WarnIfUnsupportedVersion();
     }
     C.addOffloadDeviceToolChain(CudaTC.get(), OFK);
-  } else if (IsHIP) {
+  } else if (IsHIP && !UseLLVMOffload) {
     if (auto *OMPTargetArg =
             C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) {
       Diag(clang::diag::err_drv_unsupported_opt_for_language_mode)
@@ -836,10 +838,11 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
   // We need to generate an OpenMP toolchain if the user specified targets with
   // the -fopenmp-targets option or used --offload-arch with OpenMP enabled.
   bool IsOpenMPOffloading =
-      C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
-                               options::OPT_fno_openmp, false) &&
-      (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) ||
-       C.getInputArgs().hasArg(options::OPT_offload_arch_EQ));
+      ((IsCuda || IsHIP) && UseLLVMOffload) ||
+      (C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
+                                options::OPT_fno_openmp, false) &&
+       (C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) ||
+        C.getInputArgs().hasArg(options::OPT_offload_arch_EQ)));
   if (IsOpenMPOffloading) {
     // We expect that -fopenmp-targets is always used in conjunction with the
     // option -fopenmp specifying a valid runtime with offloading support, i.e.
@@ -867,7 +870,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
       for (StringRef T : OpenMPTargets->getValues())
         OpenMPTriples.insert(T);
     } else if (C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) &&
-               !IsHIP && !IsCuda) {
+               ((!IsHIP && !IsCuda) || UseLLVMOffload)) {
       const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
       auto AMDTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs());
       auto NVPTXTriple = getNVIDIAOffloadTargetTriple(*this, C.getInputArgs(),
@@ -4152,6 +4155,8 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
 
   bool UseNewOffloadingDriver =
       C.isOffloadingHostKind(Action::OFK_OpenMP) ||
+      Args.hasFlag(options::OPT_foffload_via_llvm,
+                   options::OPT_fno_offload_via_llvm, false) ||
       Args.hasFlag(options::OPT_offload_new_driver,
                    options::OPT_no_offload_new_driver, false);
 
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index c698d38b80e578..96aa930ea28612 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1095,6 +1095,18 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("__clang_openmp_device_functions.h");
   }
 
+  if (Args.hasFlag(options::OPT_foffload_via_llvm,
+                   options::OPT_fno_offload_via_llvm, false)) {
+    // Add llvm_offload_wrappers/* to the system include path to wrap headers.
+    SmallString<128> P(D.ResourceDir);
+    llvm::sys::path::append(P, "include", "llvm_offload_wrappers");
+    CmdArgs.append({"-internal-isystem", Args.MakeArgString(P), "-include"});
+    if (JA.isDeviceOffloading(Action::OFK_OpenMP))
+      CmdArgs.push_back("__llvm_offload_device.h");
+    else
+      CmdArgs.push_back("__llvm_offload_host.h");
+  }
+
   // Add -i* options, and automatically translate to
   // -include-pch/-include-pth for transparent PCH support. It's
   // wonky, but we include looking for .gch so we can support seamless
@@ -6665,6 +6677,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   // device offloading action other than OpenMP.
   if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
                    options::OPT_fno_openmp, false) &&
+      !Args.hasFlag(options::OPT_foffload_via_llvm,
+                    options::OPT_fno_offload_via_llvm, false) &&
       (JA.isDeviceOffloading(Action::OFK_None) ||
        JA.isDeviceOffloading(Action::OFK_OpenMP))) {
     switch (D.getOpenMPRuntime(Args)) {
@@ -6742,11 +6756,16 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     Args.addOptOutFlag(CmdArgs, options::OPT_fopenmp_extensions,
                        options::OPT_fno_openmp_extensions);
   }
-
-  // Forward the new driver to change offloading code generation.
-  if (Args.hasFlag(options::OPT_offload_new_driver,
-                   options::OPT_no_offload_new_driver, false))
+  // Forward the offload runtime change to code generation, liboffload implies
+  // new driver. Otherwise, check if we should forward the new driver to change
+  // offloading code generation.
+  if (Args.hasFlag(options::OPT_foffload_via_llvm,
+                   options::OPT_fno_offload_via_llvm, false)) {
+    CmdArgs.append({"--offload-new-driver", "-foffload-via-llvm"});
+  } else if (Args.hasFlag(options::OPT_offload_new_driver,
+                          options::OPT_no_offload_new_driver, false)) {
     CmdArgs.push_back("--offload-new-driver");
+  }
 
   SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType);
 
@@ -7778,6 +7797,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   // so that only the relevant declarations are emitted.
   if (IsOpenMPDevice) {
     CmdArgs.push_back("-fopenmp-is-target-device");
+    // If we are offloading cuda/hip via llvm, it's also "cuda device code".
+    if (Args.hasFlag(options::OPT_foffload_via_llvm,
+                     options::OPT_fno_offload_via_llvm, false))
+      CmdArgs.push_back("-fcuda-is-device");
     if (OpenMPDeviceInput) {
       CmdArgs.push_back("-fopenmp-host-ir-file-path");
       CmdArgs.push_back(Args.MakeArgString(OpenMPDeviceInput->getFilename()));
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 6e9744607d9ebc..1cba3e1220264a 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1199,8 +1199,13 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
                              bool ForceStaticHostRuntime, bool IsOffloadingHost,
                              bool GompNeedsRT) {
   if (!Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
-                    options::OPT_fno_openmp, false))
+                    options::OPT_fno_openmp, false)) {
+    // We need libomptarget (liboffload) if it's the chosen offloading runtime.
+    if (Args.hasFlag(options::OPT_foffload_via_llvm,
+                     options::OPT_fno_offload_via_llvm, false))
+      CmdArgs.push_back("-lomptarget");
     return false;
+  }
 
   Driver::OpenMPRuntimeKind RTKind = TC.getDriver().getOpenMPRuntime(Args);
 
diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp
index 67a427b9d5ceee..3f9885b196ec55 100644
--- a/clang/lib/Driver/ToolChains/Cuda.cpp
+++ b/clang/lib/Driver/ToolChains/Cuda.cpp
@@ -839,17 +839,15 @@ void CudaToolChain::addClangTargetOptions(
           DeviceOffloadingKind == Action::OFK_Cuda) &&
          "Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs.");
 
-  if (DeviceOffloadingKind == Action::OFK_Cuda) {
-    CC1Args.append(
-        {"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"});
-
-    // Unsized function arguments used for variadics were introduced in CUDA-9.0
-    // We still do not support generating code that actually uses variadic
-    // arguments yet, but we do need to allow parsing them as recent CUDA
-    // headers rely on that. https://github.com/llvm/llvm-project/issues/58410
-    if (CudaInstallation.version() >= CudaVersion::CUDA_90)
-      CC1Args.push_back("-fcuda-allow-variadic-functions");
-  }
+  CC1Args.append(
+      {"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"});
+
+  // Unsized function arguments used for variadics were introduced in CUDA-9.0
+  // We still do not support generating code that actually uses variadic
+  // arguments yet, but we do need to allow parsing them as recent CUDA
+  // headers rely on that. https://github.com/llvm/llvm-project/issues/58410
+  if (CudaInstallation.version() >= CudaVersion::CUDA_90)
+    CC1Args.push_back("-fcuda-allow-variadic-functions");
 
   if (DriverArgs.hasArg(options::OPT_nogpulib))
     return;
@@ -867,6 +865,13 @@ void CudaToolChain::addClangTargetOptions(
   CC1Args.push_back("-mlink-builtin-bitcode");
   CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
 
+  // For now, we don't use any Offload/OpenMP device runtime when we offload
+  // CUDA via LLVM/Offload. We should split the Offload/OpenMP device runtime
+  // and include the "generic" (or CUDA-specific) parts.
+  if (DriverArgs.hasFlag(options::OPT_foffload_via_llvm,
+                         options::OPT_fno_offload_via_llvm, false))
+    return;
+
   clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();
 
   if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr,
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 66485c91f64de9..cafbba0a0d0c5b 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -2155,12 +2155,16 @@ bool UnwrappedLineParser::tryToParsePropertyAccessor() {
   // Track these as they do not require line breaks to be introduced.
   bool HasSpecialAccessor = false;
   bool IsTrivialPropertyAccessor = true;
+  bool HasAttribute = false;
   while (!eof()) {
-    if (Tok->isAccessSpecifierKeyword() ||
-        Tok->isOneOf(tok::semi, Keywords.kw_internal, Keywords.kw_get,
-                     Keywords.kw_init, Keywords.kw_set)) {
-      if (Tok->isOneOf(Keywords.kw_get, Keywords.kw_init, Keywords.kw_set))
+    if (const bool IsAccessorKeyword =
+            Tok->isOneOf(Keywords.kw_get, Keywords.kw_init, Keywords.kw_set);
+        IsAccessorKeyword || Tok->isAccessSpecifierKeyword() ||
+        Tok->isOneOf(tok::l_square, tok::semi, Keywords.kw_internal)) {
+      if (IsAccessorKeyword)
         HasSpecialAccessor = true;
+      else if (Tok->is(tok::l_square))
+        HasAttribute = true;
       Tok = Tokens->getNextToken();
       continue;
     }
@@ -2169,7 +2173,7 @@ bool UnwrappedLineParser::tryToParsePropertyAccessor() {
     break;
   }
 
-  if (!HasSpecialAccessor) {
+  if (!HasSpecialAccessor || HasAttribute) {
     Tokens->setPosition(StoredPosition);
     return false;
   }
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index 6242b5a7d9fe39..1364641a9b71e1 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -1659,9 +1659,8 @@ static void pruneModuleCache(const HeaderSearchOptions &HSOpts) {
   // Walk the entire module cache, looking for unused module files and module
   // indices.
   std::error_code EC;
-  SmallString<128> ModuleCachePathNative;
-  llvm::sys::path::native(HSOpts.ModuleCachePath, ModuleCachePathNative);
-  for (llvm::sys::fs::directory_iterator Dir(ModuleCachePathNative, EC), DirEnd;
+  for (llvm::sys::fs::directory_iterator Dir(HSOpts.ModuleCachePath, EC),
+       DirEnd;
        Dir != DirEnd && !EC; Dir.increment(EC)) {
     // If we don't have a directory, there's nothing to look into.
     if (!llvm::sys::fs::is_directory(Dir->path()))
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index b61aeca6bbc910..598bc556e8330a 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -333,6 +333,12 @@ set(openmp_wrapper_files
   openmp_wrappers/new
 )
 
+set(llvm_offload_wrapper_files
+  llvm_offload_wrappers/__llvm_offload.h
+  llvm_offload_wrappers/__llvm_offload_host.h
+  llvm_offload_wrappers/__llvm_offload_device.h
+)
+
 set(llvm_libc_wrapper_files
   llvm_libc_wrappers/assert.h
   llvm_libc_wrappers/stdio.h
@@ -383,7 +389,7 @@ endfunction(clang_generate_header)
 # Copy header files from the source directory to the build directory
 foreach( f ${files} ${cuda_wrapper_files} ${cuda_wrapper_bits_files}
            ${ppc_wrapper_files} ${openmp_wrapper_files} ${zos_wrapper_files} ${hlsl_files}
-           ${llvm_libc_wrapper_files})
+           ${llvm_libc_wrapper_files} ${llvm_offload_wrapper_files})
   copy_header_to_output_dir(${CMAKE_CURRENT_SOURCE_DIR} ${f})
 endforeach( f )
 
@@ -509,6 +515,7 @@ add_header_target("hlsl-resource-headers" ${hlsl_files})
 add_header_target("opencl-resource-headers" ${opencl_files})
 add_header_target("llvm-libc-resource-headers" ${llvm_libc_wrapper_files})
 add_header_target("openmp-resource-headers" ${openmp_wrapper_files})
+add_header_target("llvm-offload-resource-headers" ${llvm_offload_wrapper_files})
 add_header_target("windows-resource-headers" ${windows_only_files})
 add_header_target("utility-resource-headers" ${utility_files})
 
@@ -550,6 +557,11 @@ install(
   DESTINATION ${header_install_dir}/openmp_wrappers
   COMPONENT clang-resource-headers)
 
+install(
+  FILES ${llvm_offload_wrapper_files}
+  DESTINATION ${header_install_dir}/llvm_offload_wrappers
+  COMPONENT clang-resource-headers)
+
 install(
   FILES ${zos_wrapper_files}
   DESTINATION ${header_install_dir}/zos_wrappers
@@ -712,8 +724,15 @@ install(
   COMPONENT openmp-resource-headers)
 
 install(
   FILES ${openmp_wrapper_files}
   DESTINATION ${header_install_dir}/openmp_wrappers
   EXCLUDE_FROM_ALL
   COMPONENT openmp-resource-headers)
+
+# LLVM/Offload wrappers get their own component, matching their header target.
+install(
+  FILES ${llvm_offload_wrapper_files}
+  DESTINATION ${header_install_dir}/llvm_offload_wrappers
+  EXCLUDE_FROM_ALL
+  COMPONENT llvm-offload-resource-headers)
 
diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h
new file mode 100644
index 00000000000000..2898898904e299
--- /dev/null
+++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload.h
@@ -0,0 +1,36 @@
+/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __LLVM_OFFLOAD_H__
+#define __LLVM_OFFLOAD_H__
+
+#include <stddef.h>
+
+#define __host__ __attribute__((host))
+#define __device__ __attribute__((device))
+#define __global__ __attribute__((global))
+#define __shared__ __attribute__((shared))
+#define __constant__ __attribute__((constant))
+#define __managed__ __attribute__((managed))
+
+extern "C" {
+
+typedef struct dim3 {
+  dim3() {}
+  dim3(unsigned x) : x(x) {}
+  unsigned x = 0, y = 0, z = 0;
+} dim3;
+
+// TODO: For some reason the CUDA device compilation requires this declaration
+// to be present on the device while it is only used on the host.
+unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim,
+                                     size_t sharedMem = 0, void *stream = 0);
+}
+
+#endif // __LLVM_OFFLOAD_H__
diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h
new file mode 100644
index 00000000000000..1a813b331515b2
--- /dev/null
+++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_device.h
@@ -0,0 +1,10 @@
+/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#include "__llvm_offload.h"
diff --git a/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h
new file mode 100644
index 00000000000000..160289d169b55e
--- /dev/null
+++ b/clang/lib/Headers/llvm_offload_wrappers/__llvm_offload_host.h
@@ -0,0 +1,20 @@
+/*===------ LLVM/Offload helpers for kernel languages (CUDA/HIP) -*- c++ -*-===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __LLVM_OFFLOAD_HOST_H__
+#define __LLVM_OFFLOAD_HOST_H__
+
+#include "__llvm_offload.h"
+
+extern "C" {
+unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+                          void **args, size_t sharedMem = 0, void *stream = 0);
+}
+
+#endif // __LLVM_OFFLOAD_HOST_H__
diff --git a/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
index d5b6846b034885..3e354c63efc668 100644
--- a/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
+++ b/clang/lib/Headers/openmp_wrappers/__clang_openmp_device_functions.h
@@ -10,17 +10,15 @@
 #ifndef __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
 #define __CLANG_OPENMP_DEVICE_FUNCTIONS_H__
 
-#ifndef _OPENMP
-#error "This file is for OpenMP compilation only."
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+#ifdef __NVPTX__
 #pragma omp begin declare variant match(                                       \
     device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
 
+#pragma push_macro("__CUDA__")
 #define __CUDA__
 #define __OPENMP_NVPTX__
 
@@ -31,9 +29,10 @@ extern "C" {
 #include <__clang_cuda_device_functions.h>
 
 #undef __OPENMP_NVPTX__
-#undef __CUDA__
+#pragma pop_macro("__CUDA__")
 
 #pragma omp end declare variant
+#endif
 
 #ifdef __AMDGCN__
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp
index d2210e7e18628a..4914c10e62d0c5 100644
--- a/clang/lib/Lex/HeaderSearch.cpp
+++ b/clang/lib/Lex/HeaderSearch.cpp
@@ -267,7 +267,6 @@ std::string HeaderSearch::getCachedModuleFileNameImpl(StringRef ModuleName,
     return {};
 
   SmallString<256> Result(CachePath);
-  llvm::sys::fs::make_absolute(Result);
 
   if (HSOpts->DisableModuleHash) {
     llvm::sys::path::append(Result, ModuleName + ".pcm");
diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp
index 2c49c1f64b2da8..65b56bd1c8efc7 100644
--- a/clang/lib/Sema/SemaAPINotes.cpp
+++ b/clang/lib/Sema/SemaAPINotes.cpp
@@ -605,6 +605,10 @@ static void ProcessAPINotes(Sema &S, TagDecl *D, const api_notes::TagInfo &Info,
     D->addAttr(
         SwiftAttrAttr::Create(S.Context, "release:" + ReleaseOp.value()));
 
+  if (auto ConformsTo = Info.SwiftConformance)
+    D->addAttr(
+        SwiftAttrAttr::Create(S.Context, "conforms_to:" + ConformsTo.value()));
+
   if (auto Copyable = Info.isSwiftCopyable()) {
     if (!*Copyable)
       D->addAttr(SwiftAttrAttr::Create(S.Context, "~Copyable"));
diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp
index 580b9872c6a1de..ec37c0df56c671 100644
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@@ -1068,6 +1068,9 @@ void SemaCUDA::inheritTargetAttrs(FunctionDecl *FD,
 }
 
 std::string SemaCUDA::getConfigureFuncName() const {
+  if (getLangOpts().OffloadViaLLVM)
+    return "__llvmPushCallConfiguration";
+
   if (getLangOpts().HIP)
     return getLangOpts().HIPUseNewLaunchAPI ? "__hipPushCallConfiguration"
                                             : "hipConfigureCall";
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index c118f3818467d9..a9199f7e50f5dc 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -76,402 +76,407 @@ using namespace clang;
 using namespace serialization;
 
 //===----------------------------------------------------------------------===//
-// Declaration deserialization
+// Declaration Merging
 //===----------------------------------------------------------------------===//
 
-namespace clang {
-
-  class ASTDeclReader : public DeclVisitor<ASTDeclReader, void> {
-    ASTReader &Reader;
-    ASTRecordReader &Record;
-    ASTReader::RecordLocation Loc;
-    const GlobalDeclID ThisDeclID;
-    const SourceLocation ThisDeclLoc;
-
-    using RecordData = ASTReader::RecordData;
+namespace {
+/// Results from loading a RedeclarableDecl.
+class RedeclarableResult {
+  Decl *MergeWith;
+  GlobalDeclID FirstID;
+  bool IsKeyDecl;
 
-    TypeID DeferredTypeID = 0;
-    unsigned AnonymousDeclNumber = 0;
-    GlobalDeclID NamedDeclForTagDecl = GlobalDeclID();
-    IdentifierInfo *TypedefNameForLinkage = nullptr;
+public:
+  RedeclarableResult(Decl *MergeWith, GlobalDeclID FirstID, bool IsKeyDecl)
+      : MergeWith(MergeWith), FirstID(FirstID), IsKeyDecl(IsKeyDecl) {}
 
-    ///A flag to carry the information for a decl from the entity is
-    /// used. We use it to delay the marking of the canonical decl as used until
-    /// the entire declaration is deserialized and merged.
-    bool IsDeclMarkedUsed = false;
+  /// Retrieve the first ID.
+  GlobalDeclID getFirstID() const { return FirstID; }
 
-    uint64_t GetCurrentCursorOffset();
+  /// Is this declaration a key declaration?
+  bool isKeyDecl() const { return IsKeyDecl; }
 
-    uint64_t ReadLocalOffset() {
-      uint64_t LocalOffset = Record.readInt();
-      assert(LocalOffset < Loc.Offset && "offset point after current record");
-      return LocalOffset ? Loc.Offset - LocalOffset : 0;
-    }
+  /// Get a known declaration that this should be merged with, if
+  /// any.
+  Decl *getKnownMergeTarget() const { return MergeWith; }
+};
+} // namespace
 
-    uint64_t ReadGlobalOffset() {
-      uint64_t Local = ReadLocalOffset();
-      return Local ? Record.getGlobalBitOffset(Local) : 0;
-    }
+namespace clang {
+class ASTDeclMerger {
+  ASTReader &Reader;
 
-    SourceLocation readSourceLocation() {
-      return Record.readSourceLocation();
-    }
+public:
+  ASTDeclMerger(ASTReader &Reader) : Reader(Reader) {}
+
+  void mergeLambda(CXXRecordDecl *D, RedeclarableResult &Redecl, Decl &Context,
+                   unsigned Number);
+
+  /// \param KeyDeclID the decl ID of the key declaration \p D.
+  /// GlobalDeclID() if \p D is not a key declaration.
+  /// See the comments of ASTReader::KeyDecls for the explanation
+  /// of key declaration.
+  template <typename T>
+  void mergeRedeclarableImpl(Redeclarable<T> *D, T *Existing,
+                             GlobalDeclID KeyDeclID);
+
+  template <typename T>
+  void mergeRedeclarable(Redeclarable<T> *D, T *Existing,
+                         RedeclarableResult &Redecl) {
+    mergeRedeclarableImpl(
+        D, Existing, Redecl.isKeyDecl() ? Redecl.getFirstID() : GlobalDeclID());
+  }
 
-    SourceRange readSourceRange() {
-      return Record.readSourceRange();
-    }
+  void mergeTemplatePattern(RedeclarableTemplateDecl *D,
+                            RedeclarableTemplateDecl *Existing, bool IsKeyDecl);
 
-    TypeSourceInfo *readTypeSourceInfo() {
-      return Record.readTypeSourceInfo();
-    }
+  void MergeDefinitionData(CXXRecordDecl *D,
+                           struct CXXRecordDecl::DefinitionData &&NewDD);
+  void MergeDefinitionData(ObjCInterfaceDecl *D,
+                           struct ObjCInterfaceDecl::DefinitionData &&NewDD);
+  void MergeDefinitionData(ObjCProtocolDecl *D,
+                           struct ObjCProtocolDecl::DefinitionData &&NewDD);
+};
+} // namespace clang
 
-    GlobalDeclID readDeclID() { return Record.readDeclID(); }
+//===----------------------------------------------------------------------===//
+// Declaration deserialization
+//===----------------------------------------------------------------------===//
 
-    std::string readString() {
-      return Record.readString();
-    }
+namespace clang {
+class ASTDeclReader : public DeclVisitor<ASTDeclReader, void> {
+  ASTReader &Reader;
+  ASTDeclMerger MergeImpl;
+  ASTRecordReader &Record;
+  ASTReader::RecordLocation Loc;
+  const GlobalDeclID ThisDeclID;
+  const SourceLocation ThisDeclLoc;
+
+  using RecordData = ASTReader::RecordData;
+
+  TypeID DeferredTypeID = 0;
+  unsigned AnonymousDeclNumber = 0;
+  GlobalDeclID NamedDeclForTagDecl = GlobalDeclID();
+  IdentifierInfo *TypedefNameForLinkage = nullptr;
+
+  /// A flag recording that the decl being read is marked as used. We use it
+  /// to delay the marking of the canonical decl as used until the entire
+  /// declaration is deserialized and merged.
+  bool IsDeclMarkedUsed = false;
+
+  uint64_t GetCurrentCursorOffset();
+
+  uint64_t ReadLocalOffset() {
+    uint64_t LocalOffset = Record.readInt();
+    assert(LocalOffset < Loc.Offset && "offset point after current record");
+    return LocalOffset ? Loc.Offset - LocalOffset : 0;
+  }
 
-    void readDeclIDList(SmallVectorImpl<GlobalDeclID> &IDs) {
-      for (unsigned I = 0, Size = Record.readInt(); I != Size; ++I)
-        IDs.push_back(readDeclID());
-    }
+  uint64_t ReadGlobalOffset() {
+    uint64_t Local = ReadLocalOffset();
+    return Local ? Record.getGlobalBitOffset(Local) : 0;
+  }
 
-    Decl *readDecl() {
-      return Record.readDecl();
-    }
+  SourceLocation readSourceLocation() { return Record.readSourceLocation(); }
 
-    template<typename T>
-    T *readDeclAs() {
-      return Record.readDeclAs<T>();
-    }
+  SourceRange readSourceRange() { return Record.readSourceRange(); }
 
-    serialization::SubmoduleID readSubmoduleID() {
-      if (Record.getIdx() == Record.size())
-        return 0;
+  TypeSourceInfo *readTypeSourceInfo() { return Record.readTypeSourceInfo(); }
 
-      return Record.getGlobalSubmoduleID(Record.readInt());
-    }
+  GlobalDeclID readDeclID() { return Record.readDeclID(); }
 
-    Module *readModule() {
-      return Record.getSubmodule(readSubmoduleID());
-    }
+  std::string readString() { return Record.readString(); }
 
-    void ReadCXXRecordDefinition(CXXRecordDecl *D, bool Update,
-                                 Decl *LambdaContext = nullptr,
-                                 unsigned IndexInLambdaContext = 0);
-    void ReadCXXDefinitionData(struct CXXRecordDecl::DefinitionData &Data,
-                               const CXXRecordDecl *D, Decl *LambdaContext,
-                               unsigned IndexInLambdaContext);
-    void MergeDefinitionData(CXXRecordDecl *D,
-                             struct CXXRecordDecl::DefinitionData &&NewDD);
-    void ReadObjCDefinitionData(struct ObjCInterfaceDecl::DefinitionData &Data);
-    void MergeDefinitionData(ObjCInterfaceDecl *D,
-                             struct ObjCInterfaceDecl::DefinitionData &&NewDD);
-    void ReadObjCDefinitionData(struct ObjCProtocolDecl::DefinitionData &Data);
-    void MergeDefinitionData(ObjCProtocolDecl *D,
-                             struct ObjCProtocolDecl::DefinitionData &&NewDD);
-
-    static DeclContext *getPrimaryDCForAnonymousDecl(DeclContext *LexicalDC);
-
-    static NamedDecl *getAnonymousDeclForMerging(ASTReader &Reader,
-                                                 DeclContext *DC,
-                                                 unsigned Index);
-    static void setAnonymousDeclForMerging(ASTReader &Reader, DeclContext *DC,
-                                           unsigned Index, NamedDecl *D);
-
-    /// Commit to a primary definition of the class RD, which is known to be
-    /// a definition of the class. We might not have read the definition data
-    /// for it yet. If we haven't then allocate placeholder definition data
-    /// now too.
-    static CXXRecordDecl *getOrFakePrimaryClassDefinition(ASTReader &Reader,
-                                                          CXXRecordDecl *RD);
-
-    /// Results from loading a RedeclarableDecl.
-    class RedeclarableResult {
-      Decl *MergeWith;
-      GlobalDeclID FirstID;
-      bool IsKeyDecl;
+  void readDeclIDList(SmallVectorImpl<GlobalDeclID> &IDs) {
+    for (unsigned I = 0, Size = Record.readInt(); I != Size; ++I)
+      IDs.push_back(readDeclID());
+  }
 
-    public:
-      RedeclarableResult(Decl *MergeWith, GlobalDeclID FirstID, bool IsKeyDecl)
-          : MergeWith(MergeWith), FirstID(FirstID), IsKeyDecl(IsKeyDecl) {}
+  Decl *readDecl() { return Record.readDecl(); }
 
-      /// Retrieve the first ID.
-      GlobalDeclID getFirstID() const { return FirstID; }
+  template <typename T> T *readDeclAs() { return Record.readDeclAs<T>(); }
 
-      /// Is this declaration a key declaration?
-      bool isKeyDecl() const { return IsKeyDecl; }
+  serialization::SubmoduleID readSubmoduleID() {
+    if (Record.getIdx() == Record.size())
+      return 0;
 
-      /// Get a known declaration that this should be merged with, if
-      /// any.
-      Decl *getKnownMergeTarget() const { return MergeWith; }
-    };
+    return Record.getGlobalSubmoduleID(Record.readInt());
+  }
 
-    /// Class used to capture the result of searching for an existing
-    /// declaration of a specific kind and name, along with the ability
-    /// to update the place where this result was found (the declaration
-    /// chain hanging off an identifier or the DeclContext we searched in)
-    /// if requested.
-    class FindExistingResult {
-      ASTReader &Reader;
-      NamedDecl *New = nullptr;
-      NamedDecl *Existing = nullptr;
-      bool AddResult = false;
-      unsigned AnonymousDeclNumber = 0;
-      IdentifierInfo *TypedefNameForLinkage = nullptr;
+  Module *readModule() { return Record.getSubmodule(readSubmoduleID()); }
+
+  void ReadCXXRecordDefinition(CXXRecordDecl *D, bool Update,
+                               Decl *LambdaContext = nullptr,
+                               unsigned IndexInLambdaContext = 0);
+  void ReadCXXDefinitionData(struct CXXRecordDecl::DefinitionData &Data,
+                             const CXXRecordDecl *D, Decl *LambdaContext,
+                             unsigned IndexInLambdaContext);
+  void ReadObjCDefinitionData(struct ObjCInterfaceDecl::DefinitionData &Data);
+  void ReadObjCDefinitionData(struct ObjCProtocolDecl::DefinitionData &Data);
+
+  static DeclContext *getPrimaryDCForAnonymousDecl(DeclContext *LexicalDC);
+
+  static NamedDecl *getAnonymousDeclForMerging(ASTReader &Reader,
+                                               DeclContext *DC, unsigned Index);
+  static void setAnonymousDeclForMerging(ASTReader &Reader, DeclContext *DC,
+                                         unsigned Index, NamedDecl *D);
+
+  /// Commit to a primary definition of the class RD, which is known to be
+  /// a definition of the class. We might not have read the definition data
+  /// for it yet. If we haven't then allocate placeholder definition data
+  /// now too.
+  static CXXRecordDecl *getOrFakePrimaryClassDefinition(ASTReader &Reader,
+                                                        CXXRecordDecl *RD);
+
+  /// Class used to capture the result of searching for an existing
+  /// declaration of a specific kind and name, along with the ability
+  /// to update the place where this result was found (the declaration
+  /// chain hanging off an identifier or the DeclContext we searched in)
+  /// if requested.
+  class FindExistingResult {
+    ASTReader &Reader;
+    NamedDecl *New = nullptr;
+    NamedDecl *Existing = nullptr;
+    bool AddResult = false;
+    unsigned AnonymousDeclNumber = 0;
+    IdentifierInfo *TypedefNameForLinkage = nullptr;
 
-    public:
-      FindExistingResult(ASTReader &Reader) : Reader(Reader) {}
-
-      FindExistingResult(ASTReader &Reader, NamedDecl *New, NamedDecl *Existing,
-                         unsigned AnonymousDeclNumber,
-                         IdentifierInfo *TypedefNameForLinkage)
-          : Reader(Reader), New(New), Existing(Existing), AddResult(true),
-            AnonymousDeclNumber(AnonymousDeclNumber),
-            TypedefNameForLinkage(TypedefNameForLinkage) {}
-
-      FindExistingResult(FindExistingResult &&Other)
-          : Reader(Other.Reader), New(Other.New), Existing(Other.Existing),
-            AddResult(Other.AddResult),
-            AnonymousDeclNumber(Other.AnonymousDeclNumber),
-            TypedefNameForLinkage(Other.TypedefNameForLinkage) {
-        Other.AddResult = false;
-      }
+  public:
+    FindExistingResult(ASTReader &Reader) : Reader(Reader) {}
+
+    FindExistingResult(ASTReader &Reader, NamedDecl *New, NamedDecl *Existing,
+                       unsigned AnonymousDeclNumber,
+                       IdentifierInfo *TypedefNameForLinkage)
+        : Reader(Reader), New(New), Existing(Existing), AddResult(true),
+          AnonymousDeclNumber(AnonymousDeclNumber),
+          TypedefNameForLinkage(TypedefNameForLinkage) {}
+
+    FindExistingResult(FindExistingResult &&Other)
+        : Reader(Other.Reader), New(Other.New), Existing(Other.Existing),
+          AddResult(Other.AddResult),
+          AnonymousDeclNumber(Other.AnonymousDeclNumber),
+          TypedefNameForLinkage(Other.TypedefNameForLinkage) {
+      Other.AddResult = false;
+    }
 
-      FindExistingResult &operator=(FindExistingResult &&) = delete;
-      ~FindExistingResult();
+    FindExistingResult &operator=(FindExistingResult &&) = delete;
+    ~FindExistingResult();
 
-      /// Suppress the addition of this result into the known set of
-      /// names.
-      void suppress() { AddResult = false; }
+    /// Suppress the addition of this result into the known set of
+    /// names.
+    void suppress() { AddResult = false; }
 
-      operator NamedDecl*() const { return Existing; }
+    operator NamedDecl *() const { return Existing; }
 
-      template<typename T>
-      operator T*() const { return dyn_cast_or_null<T>(Existing); }
-    };
+    template <typename T> operator T *() const {
+      return dyn_cast_or_null<T>(Existing);
+    }
+  };
 
-    static DeclContext *getPrimaryContextForMerging(ASTReader &Reader,
-                                                    DeclContext *DC);
-    FindExistingResult findExisting(NamedDecl *D);
+  static DeclContext *getPrimaryContextForMerging(ASTReader &Reader,
+                                                  DeclContext *DC);
+  FindExistingResult findExisting(NamedDecl *D);
 
-  public:
-    ASTDeclReader(ASTReader &Reader, ASTRecordReader &Record,
-                  ASTReader::RecordLocation Loc, GlobalDeclID thisDeclID,
-                  SourceLocation ThisDeclLoc)
-        : Reader(Reader), Record(Record), Loc(Loc), ThisDeclID(thisDeclID),
-          ThisDeclLoc(ThisDeclLoc) {}
-
-    template <typename T>
-    static void AddLazySpecializations(T *D,
-                                       SmallVectorImpl<GlobalDeclID> &IDs) {
-      if (IDs.empty())
-        return;
+public:
+  ASTDeclReader(ASTReader &Reader, ASTRecordReader &Record,
+                ASTReader::RecordLocation Loc, GlobalDeclID thisDeclID,
+                SourceLocation ThisDeclLoc)
+      : Reader(Reader), MergeImpl(Reader), Record(Record), Loc(Loc),
+        ThisDeclID(thisDeclID), ThisDeclLoc(ThisDeclLoc) {}
+
+  template <typename T>
+  static void AddLazySpecializations(T *D, SmallVectorImpl<GlobalDeclID> &IDs) {
+    if (IDs.empty())
+      return;
 
-      // FIXME: We should avoid this pattern of getting the ASTContext.
-      ASTContext &C = D->getASTContext();
+    // FIXME: We should avoid this pattern of getting the ASTContext.
+    ASTContext &C = D->getASTContext();
 
-      auto *&LazySpecializations = D->getCommonPtr()->LazySpecializations;
+    auto *&LazySpecializations = D->getCommonPtr()->LazySpecializations;
 
-      if (auto &Old = LazySpecializations) {
-        IDs.insert(IDs.end(), Old + 1, Old + 1 + Old[0].getRawValue());
-        llvm::sort(IDs);
-        IDs.erase(std::unique(IDs.begin(), IDs.end()), IDs.end());
-      }
+    if (auto &Old = LazySpecializations) {
+      IDs.insert(IDs.end(), Old + 1, Old + 1 + Old[0].getRawValue());
+      llvm::sort(IDs);
+      IDs.erase(std::unique(IDs.begin(), IDs.end()), IDs.end());
+    }
 
-      auto *Result = new (C) GlobalDeclID[1 + IDs.size()];
-      *Result = GlobalDeclID(IDs.size());
+    auto *Result = new (C) GlobalDeclID[1 + IDs.size()];
+    *Result = GlobalDeclID(IDs.size());
 
-      std::copy(IDs.begin(), IDs.end(), Result + 1);
+    std::copy(IDs.begin(), IDs.end(), Result + 1);
 
-      LazySpecializations = Result;
-    }
-
-    template <typename DeclT>
-    static Decl *getMostRecentDeclImpl(Redeclarable<DeclT> *D);
-    static Decl *getMostRecentDeclImpl(...);
-    static Decl *getMostRecentDecl(Decl *D);
+    LazySpecializations = Result;
+  }
 
-    static void mergeInheritableAttributes(ASTReader &Reader, Decl *D,
-                                           Decl *Previous);
+  template <typename DeclT>
+  static Decl *getMostRecentDeclImpl(Redeclarable<DeclT> *D);
+  static Decl *getMostRecentDeclImpl(...);
+  static Decl *getMostRecentDecl(Decl *D);
 
-    template <typename DeclT>
-    static void attachPreviousDeclImpl(ASTReader &Reader,
-                                       Redeclarable<DeclT> *D, Decl *Previous,
-                                       Decl *Canon);
-    static void attachPreviousDeclImpl(ASTReader &Reader, ...);
-    static void attachPreviousDecl(ASTReader &Reader, Decl *D, Decl *Previous,
-                                   Decl *Canon);
+  template <typename DeclT>
+  static void attachPreviousDeclImpl(ASTReader &Reader, Redeclarable<DeclT> *D,
+                                     Decl *Previous, Decl *Canon);
+  static void attachPreviousDeclImpl(ASTReader &Reader, ...);
+  static void attachPreviousDecl(ASTReader &Reader, Decl *D, Decl *Previous,
+                                 Decl *Canon);
 
-    template <typename DeclT>
-    static void attachLatestDeclImpl(Redeclarable<DeclT> *D, Decl *Latest);
-    static void attachLatestDeclImpl(...);
-    static void attachLatestDecl(Decl *D, Decl *latest);
+  template <typename DeclT>
+  static void attachLatestDeclImpl(Redeclarable<DeclT> *D, Decl *Latest);
+  static void attachLatestDeclImpl(...);
+  static void attachLatestDecl(Decl *D, Decl *latest);
 
-    template <typename DeclT>
-    static void markIncompleteDeclChainImpl(Redeclarable<DeclT> *D);
-    static void markIncompleteDeclChainImpl(...);
+  template <typename DeclT>
+  static void markIncompleteDeclChainImpl(Redeclarable<DeclT> *D);
+  static void markIncompleteDeclChainImpl(...);
 
-    void ReadFunctionDefinition(FunctionDecl *FD);
-    void Visit(Decl *D);
+  void ReadFunctionDefinition(FunctionDecl *FD);
+  void Visit(Decl *D);
 
-    void UpdateDecl(Decl *D, SmallVectorImpl<GlobalDeclID> &);
+  void UpdateDecl(Decl *D, SmallVectorImpl<GlobalDeclID> &);
 
-    static void setNextObjCCategory(ObjCCategoryDecl *Cat,
-                                    ObjCCategoryDecl *Next) {
-      Cat->NextClassCategory = Next;
-    }
+  static void setNextObjCCategory(ObjCCategoryDecl *Cat,
+                                  ObjCCategoryDecl *Next) {
+    Cat->NextClassCategory = Next;
+  }
 
-    void VisitDecl(Decl *D);
-    void VisitPragmaCommentDecl(PragmaCommentDecl *D);
-    void VisitPragmaDetectMismatchDecl(PragmaDetectMismatchDecl *D);
-    void VisitTranslationUnitDecl(TranslationUnitDecl *TU);
-    void VisitNamedDecl(NamedDecl *ND);
-    void VisitLabelDecl(LabelDecl *LD);
-    void VisitNamespaceDecl(NamespaceDecl *D);
-    void VisitHLSLBufferDecl(HLSLBufferDecl *D);
-    void VisitUsingDirectiveDecl(UsingDirectiveDecl *D);
-    void VisitNamespaceAliasDecl(NamespaceAliasDecl *D);
-    void VisitTypeDecl(TypeDecl *TD);
-    RedeclarableResult VisitTypedefNameDecl(TypedefNameDecl *TD);
-    void VisitTypedefDecl(TypedefDecl *TD);
-    void VisitTypeAliasDecl(TypeAliasDecl *TD);
-    void VisitUnresolvedUsingTypenameDecl(UnresolvedUsingTypenameDecl *D);
-    void VisitUnresolvedUsingIfExistsDecl(UnresolvedUsingIfExistsDecl *D);
-    RedeclarableResult VisitTagDecl(TagDecl *TD);
-    void VisitEnumDecl(EnumDecl *ED);
-    RedeclarableResult VisitRecordDeclImpl(RecordDecl *RD);
-    void VisitRecordDecl(RecordDecl *RD);
-    RedeclarableResult VisitCXXRecordDeclImpl(CXXRecordDecl *D);
-    void VisitCXXRecordDecl(CXXRecordDecl *D) { VisitCXXRecordDeclImpl(D); }
-    RedeclarableResult VisitClassTemplateSpecializationDeclImpl(
-                                            ClassTemplateSpecializationDecl *D);
-
-    void VisitClassTemplateSpecializationDecl(
-        ClassTemplateSpecializationDecl *D) {
-      VisitClassTemplateSpecializationDeclImpl(D);
-    }
+  void VisitDecl(Decl *D);
+  void VisitPragmaCommentDecl(PragmaCommentDecl *D);
+  void VisitPragmaDetectMismatchDecl(PragmaDetectMismatchDecl *D);
+  void VisitTranslationUnitDecl(TranslationUnitDecl *TU);
+  void VisitNamedDecl(NamedDecl *ND);
+  void VisitLabelDecl(LabelDecl *LD);
+  void VisitNamespaceDecl(NamespaceDecl *D);
+  void VisitHLSLBufferDecl(HLSLBufferDecl *D);
+  void VisitUsingDirectiveDecl(UsingDirectiveDecl *D);
+  void VisitNamespaceAliasDecl(NamespaceAliasDecl *D);
+  void VisitTypeDecl(TypeDecl *TD);
+  RedeclarableResult VisitTypedefNameDecl(TypedefNameDecl *TD);
+  void VisitTypedefDecl(TypedefDecl *TD);
+  void VisitTypeAliasDecl(TypeAliasDecl *TD);
+  void VisitUnresolvedUsingTypenameDecl(UnresolvedUsingTypenameDecl *D);
+  void VisitUnresolvedUsingIfExistsDecl(UnresolvedUsingIfExistsDecl *D);
+  RedeclarableResult VisitTagDecl(TagDecl *TD);
+  void VisitEnumDecl(EnumDecl *ED);
+  RedeclarableResult VisitRecordDeclImpl(RecordDecl *RD);
+  void VisitRecordDecl(RecordDecl *RD);
+  RedeclarableResult VisitCXXRecordDeclImpl(CXXRecordDecl *D);
+  void VisitCXXRecordDecl(CXXRecordDecl *D) { VisitCXXRecordDeclImpl(D); }
+  RedeclarableResult
+  VisitClassTemplateSpecializationDeclImpl(ClassTemplateSpecializationDecl *D);
+
+  void
+  VisitClassTemplateSpecializationDecl(ClassTemplateSpecializationDecl *D) {
+    VisitClassTemplateSpecializationDeclImpl(D);
+  }
 
-    void VisitClassTemplatePartialSpecializationDecl(
-        ClassTemplatePartialSpecializationDecl *D);
-    RedeclarableResult
-    VisitVarTemplateSpecializationDeclImpl(VarTemplateSpecializationDecl *D);
+  void VisitClassTemplatePartialSpecializationDecl(
+      ClassTemplatePartialSpecializationDecl *D);
+  RedeclarableResult
+  VisitVarTemplateSpecializationDeclImpl(VarTemplateSpecializationDecl *D);
 
-    void VisitVarTemplateSpecializationDecl(VarTemplateSpecializationDecl *D) {
-      VisitVarTemplateSpecializationDeclImpl(D);
-    }
+  void VisitVarTemplateSpecializationDecl(VarTemplateSpecializationDecl *D) {
+    VisitVarTemplateSpecializationDeclImpl(D);
+  }
 
-    void VisitVarTemplatePartialSpecializationDecl(
-        VarTemplatePartialSpecializationDecl *D);
-    void VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D);
-    void VisitValueDecl(ValueDecl *VD);
-    void VisitEnumConstantDecl(EnumConstantDecl *ECD);
-    void VisitUnresolvedUsingValueDecl(UnresolvedUsingValueDecl *D);
-    void VisitDeclaratorDecl(DeclaratorDecl *DD);
-    void VisitFunctionDecl(FunctionDecl *FD);
-    void VisitCXXDeductionGuideDecl(CXXDeductionGuideDecl *GD);
-    void VisitCXXMethodDecl(CXXMethodDecl *D);
-    void VisitCXXConstructorDecl(CXXConstructorDecl *D);
-    void VisitCXXDestructorDecl(CXXDestructorDecl *D);
-    void VisitCXXConversionDecl(CXXConversionDecl *D);
-    void VisitFieldDecl(FieldDecl *FD);
-    void VisitMSPropertyDecl(MSPropertyDecl *FD);
-    void VisitMSGuidDecl(MSGuidDecl *D);
-    void VisitUnnamedGlobalConstantDecl(UnnamedGlobalConstantDecl *D);
-    void VisitTemplateParamObjectDecl(TemplateParamObjectDecl *D);
-    void VisitIndirectFieldDecl(IndirectFieldDecl *FD);
-    RedeclarableResult VisitVarDeclImpl(VarDecl *D);
-    void ReadVarDeclInit(VarDecl *VD);
-    void VisitVarDecl(VarDecl *VD) { VisitVarDeclImpl(VD); }
-    void VisitImplicitParamDecl(ImplicitParamDecl *PD);
-    void VisitParmVarDecl(ParmVarDecl *PD);
-    void VisitDecompositionDecl(DecompositionDecl *DD);
-    void VisitBindingDecl(BindingDecl *BD);
-    void VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D);
-    void VisitTemplateDecl(TemplateDecl *D);
-    void VisitConceptDecl(ConceptDecl *D);
-    void VisitImplicitConceptSpecializationDecl(
-        ImplicitConceptSpecializationDecl *D);
-    void VisitRequiresExprBodyDecl(RequiresExprBodyDecl *D);
-    RedeclarableResult VisitRedeclarableTemplateDecl(RedeclarableTemplateDecl *D);
-    void VisitClassTemplateDecl(ClassTemplateDecl *D);
-    void VisitBuiltinTemplateDecl(BuiltinTemplateDecl *D);
-    void VisitVarTemplateDecl(VarTemplateDecl *D);
-    void VisitFunctionTemplateDecl(FunctionTemplateDecl *D);
-    void VisitTemplateTemplateParmDecl(TemplateTemplateParmDecl *D);
-    void VisitTypeAliasTemplateDecl(TypeAliasTemplateDecl *D);
-    void VisitUsingDecl(UsingDecl *D);
-    void VisitUsingEnumDecl(UsingEnumDecl *D);
-    void VisitUsingPackDecl(UsingPackDecl *D);
-    void VisitUsingShadowDecl(UsingShadowDecl *D);
-    void VisitConstructorUsingShadowDecl(ConstructorUsingShadowDecl *D);
-    void VisitLinkageSpecDecl(LinkageSpecDecl *D);
-    void VisitExportDecl(ExportDecl *D);
-    void VisitFileScopeAsmDecl(FileScopeAsmDecl *AD);
-    void VisitTopLevelStmtDecl(TopLevelStmtDecl *D);
-    void VisitImportDecl(ImportDecl *D);
-    void VisitAccessSpecDecl(AccessSpecDecl *D);
-    void VisitFriendDecl(FriendDecl *D);
-    void VisitFriendTemplateDecl(FriendTemplateDecl *D);
-    void VisitStaticAssertDecl(StaticAssertDecl *D);
-    void VisitBlockDecl(BlockDecl *BD);
-    void VisitCapturedDecl(CapturedDecl *CD);
-    void VisitEmptyDecl(EmptyDecl *D);
-    void VisitLifetimeExtendedTemporaryDecl(LifetimeExtendedTemporaryDecl *D);
-
-    std::pair<uint64_t, uint64_t> VisitDeclContext(DeclContext *DC);
-
-    template<typename T>
-    RedeclarableResult VisitRedeclarable(Redeclarable<T> *D);
-
-    template <typename T>
-    void mergeRedeclarable(Redeclarable<T> *D, RedeclarableResult &Redecl);
-
-    void mergeLambda(CXXRecordDecl *D, RedeclarableResult &Redecl,
-                     Decl *Context, unsigned Number);
-
-    void mergeRedeclarableTemplate(RedeclarableTemplateDecl *D,
-                                   RedeclarableResult &Redecl);
-
-    template <typename T>
-    void mergeRedeclarable(Redeclarable<T> *D, T *Existing,
-                           RedeclarableResult &Redecl);
-
-    template<typename T>
-    void mergeMergeable(Mergeable<T> *D);
-
-    void mergeMergeable(LifetimeExtendedTemporaryDecl *D);
-
-    void mergeTemplatePattern(RedeclarableTemplateDecl *D,
-                              RedeclarableTemplateDecl *Existing,
-                              bool IsKeyDecl);
-
-    ObjCTypeParamList *ReadObjCTypeParamList();
-
-    // FIXME: Reorder according to DeclNodes.td?
-    void VisitObjCMethodDecl(ObjCMethodDecl *D);
-    void VisitObjCTypeParamDecl(ObjCTypeParamDecl *D);
-    void VisitObjCContainerDecl(ObjCContainerDecl *D);
-    void VisitObjCInterfaceDecl(ObjCInterfaceDecl *D);
-    void VisitObjCIvarDecl(ObjCIvarDecl *D);
-    void VisitObjCProtocolDecl(ObjCProtocolDecl *D);
-    void VisitObjCAtDefsFieldDecl(ObjCAtDefsFieldDecl *D);
-    void VisitObjCCategoryDecl(ObjCCategoryDecl *D);
-    void VisitObjCImplDecl(ObjCImplDecl *D);
-    void VisitObjCCategoryImplDecl(ObjCCategoryImplDecl *D);
-    void VisitObjCImplementationDecl(ObjCImplementationDecl *D);
-    void VisitObjCCompatibleAliasDecl(ObjCCompatibleAliasDecl *D);
-    void VisitObjCPropertyDecl(ObjCPropertyDecl *D);
-    void VisitObjCPropertyImplDecl(ObjCPropertyImplDecl *D);
-    void VisitOMPThreadPrivateDecl(OMPThreadPrivateDecl *D);
-    void VisitOMPAllocateDecl(OMPAllocateDecl *D);
-    void VisitOMPDeclareReductionDecl(OMPDeclareReductionDecl *D);
-    void VisitOMPDeclareMapperDecl(OMPDeclareMapperDecl *D);
-    void VisitOMPRequiresDecl(OMPRequiresDecl *D);
-    void VisitOMPCapturedExprDecl(OMPCapturedExprDecl *D);
+  void VisitVarTemplatePartialSpecializationDecl(
+      VarTemplatePartialSpecializationDecl *D);
+  void VisitTemplateTypeParmDecl(TemplateTypeParmDecl *D);
+  void VisitValueDecl(ValueDecl *VD);
+  void VisitEnumConstantDecl(EnumConstantDecl *ECD);
+  void VisitUnresolvedUsingValueDecl(UnresolvedUsingValueDecl *D);
+  void VisitDeclaratorDecl(DeclaratorDecl *DD);
+  void VisitFunctionDecl(FunctionDecl *FD);
+  void VisitCXXDeductionGuideDecl(CXXDeductionGuideDecl *GD);
+  void VisitCXXMethodDecl(CXXMethodDecl *D);
+  void VisitCXXConstructorDecl(CXXConstructorDecl *D);
+  void VisitCXXDestructorDecl(CXXDestructorDecl *D);
+  void VisitCXXConversionDecl(CXXConversionDecl *D);
+  void VisitFieldDecl(FieldDecl *FD);
+  void VisitMSPropertyDecl(MSPropertyDecl *FD);
+  void VisitMSGuidDecl(MSGuidDecl *D);
+  void VisitUnnamedGlobalConstantDecl(UnnamedGlobalConstantDecl *D);
+  void VisitTemplateParamObjectDecl(TemplateParamObjectDecl *D);
+  void VisitIndirectFieldDecl(IndirectFieldDecl *FD);
+  RedeclarableResult VisitVarDeclImpl(VarDecl *D);
+  void ReadVarDeclInit(VarDecl *VD);
+  void VisitVarDecl(VarDecl *VD) { VisitVarDeclImpl(VD); }
+  void VisitImplicitParamDecl(ImplicitParamDecl *PD);
+  void VisitParmVarDecl(ParmVarDecl *PD);
+  void VisitDecompositionDecl(DecompositionDecl *DD);
+  void VisitBindingDecl(BindingDecl *BD);
+  void VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D);
+  void VisitTemplateDecl(TemplateDecl *D);
+  void VisitConceptDecl(ConceptDecl *D);
+  void
+  VisitImplicitConceptSpecializationDecl(ImplicitConceptSpecializationDecl *D);
+  void VisitRequiresExprBodyDecl(RequiresExprBodyDecl *D);
+  RedeclarableResult VisitRedeclarableTemplateDecl(RedeclarableTemplateDecl *D);
+  void VisitClassTemplateDecl(ClassTemplateDecl *D);
+  void VisitBuiltinTemplateDecl(BuiltinTemplateDecl *D);
+  void VisitVarTemplateDecl(VarTemplateDecl *D);
+  void VisitFunctionTemplateDecl(FunctionTemplateDecl *D);
+  void VisitTemplateTemplateParmDecl(TemplateTemplateParmDecl *D);
+  void VisitTypeAliasTemplateDecl(TypeAliasTemplateDecl *D);
+  void VisitUsingDecl(UsingDecl *D);
+  void VisitUsingEnumDecl(UsingEnumDecl *D);
+  void VisitUsingPackDecl(UsingPackDecl *D);
+  void VisitUsingShadowDecl(UsingShadowDecl *D);
+  void VisitConstructorUsingShadowDecl(ConstructorUsingShadowDecl *D);
+  void VisitLinkageSpecDecl(LinkageSpecDecl *D);
+  void VisitExportDecl(ExportDecl *D);
+  void VisitFileScopeAsmDecl(FileScopeAsmDecl *AD);
+  void VisitTopLevelStmtDecl(TopLevelStmtDecl *D);
+  void VisitImportDecl(ImportDecl *D);
+  void VisitAccessSpecDecl(AccessSpecDecl *D);
+  void VisitFriendDecl(FriendDecl *D);
+  void VisitFriendTemplateDecl(FriendTemplateDecl *D);
+  void VisitStaticAssertDecl(StaticAssertDecl *D);
+  void VisitBlockDecl(BlockDecl *BD);
+  void VisitCapturedDecl(CapturedDecl *CD);
+  void VisitEmptyDecl(EmptyDecl *D);
+  void VisitLifetimeExtendedTemporaryDecl(LifetimeExtendedTemporaryDecl *D);
+
+  std::pair<uint64_t, uint64_t> VisitDeclContext(DeclContext *DC);
+
+  template <typename T>
+  RedeclarableResult VisitRedeclarable(Redeclarable<T> *D);
+
+  template <typename T>
+  void mergeRedeclarable(Redeclarable<T> *D, RedeclarableResult &Redecl);
+
+  void mergeRedeclarableTemplate(RedeclarableTemplateDecl *D,
+                                 RedeclarableResult &Redecl);
+
+  template <typename T> void mergeMergeable(Mergeable<T> *D);
+
+  void mergeMergeable(LifetimeExtendedTemporaryDecl *D);
+
+  ObjCTypeParamList *ReadObjCTypeParamList();
+
+  // FIXME: Reorder according to DeclNodes.td?
+  void VisitObjCMethodDecl(ObjCMethodDecl *D);
+  void VisitObjCTypeParamDecl(ObjCTypeParamDecl *D);
+  void VisitObjCContainerDecl(ObjCContainerDecl *D);
+  void VisitObjCInterfaceDecl(ObjCInterfaceDecl *D);
+  void VisitObjCIvarDecl(ObjCIvarDecl *D);
+  void VisitObjCProtocolDecl(ObjCProtocolDecl *D);
+  void VisitObjCAtDefsFieldDecl(ObjCAtDefsFieldDecl *D);
+  void VisitObjCCategoryDecl(ObjCCategoryDecl *D);
+  void VisitObjCImplDecl(ObjCImplDecl *D);
+  void VisitObjCCategoryImplDecl(ObjCCategoryImplDecl *D);
+  void VisitObjCImplementationDecl(ObjCImplementationDecl *D);
+  void VisitObjCCompatibleAliasDecl(ObjCCompatibleAliasDecl *D);
+  void VisitObjCPropertyDecl(ObjCPropertyDecl *D);
+  void VisitObjCPropertyImplDecl(ObjCPropertyImplDecl *D);
+  void VisitOMPThreadPrivateDecl(OMPThreadPrivateDecl *D);
+  void VisitOMPAllocateDecl(OMPAllocateDecl *D);
+  void VisitOMPDeclareReductionDecl(OMPDeclareReductionDecl *D);
+  void VisitOMPDeclareMapperDecl(OMPDeclareMapperDecl *D);
+  void VisitOMPRequiresDecl(OMPRequiresDecl *D);
+  void VisitOMPCapturedExprDecl(OMPCapturedExprDecl *D);
   };
 
-} // namespace clang
+  } // namespace clang
 
 namespace {
 
@@ -714,8 +719,7 @@ void ASTDeclReader::VisitTypeDecl(TypeDecl *TD) {
   DeferredTypeID = Record.getGlobalTypeID(Record.readInt());
 }
 
-ASTDeclReader::RedeclarableResult
-ASTDeclReader::VisitTypedefNameDecl(TypedefNameDecl *TD) {
+RedeclarableResult ASTDeclReader::VisitTypedefNameDecl(TypedefNameDecl *TD) {
   RedeclarableResult Redecl = VisitRedeclarable(TD);
   VisitTypeDecl(TD);
   TypeSourceInfo *TInfo = readTypeSourceInfo();
@@ -746,7 +750,7 @@ void ASTDeclReader::VisitTypeAliasDecl(TypeAliasDecl *TD) {
     mergeRedeclarable(TD, Redecl);
 }
 
-ASTDeclReader::RedeclarableResult ASTDeclReader::VisitTagDecl(TagDecl *TD) {
+RedeclarableResult ASTDeclReader::VisitTagDecl(TagDecl *TD) {
   RedeclarableResult Redecl = VisitRedeclarable(TD);
   VisitTypeDecl(TD);
 
@@ -837,8 +841,7 @@ void ASTDeclReader::VisitEnumDecl(EnumDecl *ED) {
   }
 }
 
-ASTDeclReader::RedeclarableResult
-ASTDeclReader::VisitRecordDeclImpl(RecordDecl *RD) {
+RedeclarableResult ASTDeclReader::VisitRecordDeclImpl(RecordDecl *RD) {
   RedeclarableResult Redecl = VisitTagDecl(RD);
 
   BitsUnpacker RecordDeclBits(Record.readInt());
@@ -1116,7 +1119,7 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl *FD) {
   }
 
   if (Existing)
-    mergeRedeclarable(FD, Existing, Redecl);
+    MergeImpl.mergeRedeclarable(FD, Existing, Redecl);
   else if (auto Kind = FD->getTemplatedKind();
            Kind == FunctionDecl::TK_FunctionTemplate ||
            Kind == FunctionDecl::TK_FunctionTemplateSpecialization) {
@@ -1267,8 +1270,8 @@ void ASTDeclReader::ReadObjCDefinitionData(
                                   Reader.getContext());
 }
 
-void ASTDeclReader::MergeDefinitionData(ObjCInterfaceDecl *D,
-         struct ObjCInterfaceDecl::DefinitionData &&NewDD) {
+void ASTDeclMerger::MergeDefinitionData(
+    ObjCInterfaceDecl *D, struct ObjCInterfaceDecl::DefinitionData &&NewDD) {
   struct ObjCInterfaceDecl::DefinitionData &DD = D->data();
   if (DD.Definition == NewDD.Definition)
     return;
@@ -1298,7 +1301,7 @@ void ASTDeclReader::VisitObjCInterfaceDecl(ObjCInterfaceDecl *ID) {
     if (Canon->Data.getPointer()) {
       // If we already have a definition, keep the definition invariant and
       // merge the data.
-      MergeDefinitionData(Canon, std::move(ID->data()));
+      MergeImpl.MergeDefinitionData(Canon, std::move(ID->data()));
       ID->Data = Canon->Data;
     } else {
       // Set the definition data of the canonical declaration, so other
@@ -1378,7 +1381,7 @@ void ASTDeclReader::ReadObjCDefinitionData(
     Data.HasODRHash = true;
 }
 
-void ASTDeclReader::MergeDefinitionData(
+void ASTDeclMerger::MergeDefinitionData(
     ObjCProtocolDecl *D, struct ObjCProtocolDecl::DefinitionData &&NewDD) {
   struct ObjCProtocolDecl::DefinitionData &DD = D->data();
   if (DD.Definition == NewDD.Definition)
@@ -1408,7 +1411,7 @@ void ASTDeclReader::VisitObjCProtocolDecl(ObjCProtocolDecl *PD) {
     if (Canon->Data.getPointer()) {
       // If we already have a definition, keep the definition invariant and
       // merge the data.
-      MergeDefinitionData(Canon, std::move(PD->data()));
+      MergeImpl.MergeDefinitionData(Canon, std::move(PD->data()));
       PD->Data = Canon->Data;
     } else {
       // Set the definition data of the canonical declaration, so other
@@ -1594,7 +1597,7 @@ void ASTDeclReader::VisitIndirectFieldDecl(IndirectFieldDecl *FD) {
   mergeMergeable(FD);
 }
 
-ASTDeclReader::RedeclarableResult ASTDeclReader::VisitVarDeclImpl(VarDecl *VD) {
+RedeclarableResult ASTDeclReader::VisitVarDeclImpl(VarDecl *VD) {
   RedeclarableResult Redecl = VisitRedeclarable(VD);
   VisitDeclaratorDecl(VD);
 
@@ -2050,7 +2053,7 @@ void ASTDeclReader::ReadCXXDefinitionData(
   }
 }
 
-void ASTDeclReader::MergeDefinitionData(
+void ASTDeclMerger::MergeDefinitionData(
     CXXRecordDecl *D, struct CXXRecordDecl::DefinitionData &&MergeDD) {
   assert(D->DefinitionData &&
          "merging class definition into non-definition");
@@ -2176,7 +2179,7 @@ void ASTDeclReader::ReadCXXRecordDefinition(CXXRecordDecl *D, bool Update,
   // happen either because we're reading an update record, or because we've
   // already done some merging. Either way, just merge into it.
   if (Canon->DefinitionData != DD) {
-    MergeDefinitionData(Canon, std::move(*DD));
+    MergeImpl.MergeDefinitionData(Canon, std::move(*DD));
     return;
   }
 
@@ -2190,8 +2193,7 @@ void ASTDeclReader::ReadCXXRecordDefinition(CXXRecordDecl *D, bool Update,
     Reader.PendingDefinitions.insert(D);
 }
 
-ASTDeclReader::RedeclarableResult
-ASTDeclReader::VisitCXXRecordDeclImpl(CXXRecordDecl *D) {
+RedeclarableResult ASTDeclReader::VisitCXXRecordDeclImpl(CXXRecordDecl *D) {
   RedeclarableResult Redecl = VisitRecordDeclImpl(D);
 
   ASTContext &C = Reader.getContext();
@@ -2241,7 +2243,12 @@ ASTDeclReader::VisitCXXRecordDeclImpl(CXXRecordDecl *D) {
     LambdaContext = readDecl();
     if (LambdaContext)
       IndexInLambdaContext = Record.readInt();
-    mergeLambda(D, Redecl, LambdaContext, IndexInLambdaContext);
+    if (LambdaContext)
+      MergeImpl.mergeLambda(D, Redecl, *LambdaContext, IndexInLambdaContext);
+    else
+      // If we don't have a mangling context, treat this like any other
+      // declaration.
+      mergeRedeclarable(D, Redecl);
     break;
   }
   }
@@ -2398,7 +2405,7 @@ void ASTDeclReader::VisitImplicitConceptSpecializationDecl(
 void ASTDeclReader::VisitRequiresExprBodyDecl(RequiresExprBodyDecl *D) {
 }
 
-ASTDeclReader::RedeclarableResult
+RedeclarableResult
 ASTDeclReader::VisitRedeclarableTemplateDecl(RedeclarableTemplateDecl *D) {
   RedeclarableResult Redecl = VisitRedeclarable(D);
 
@@ -2470,8 +2477,7 @@ void ASTDeclReader::VisitVarTemplateDecl(VarTemplateDecl *D) {
   }
 }
 
-ASTDeclReader::RedeclarableResult
-ASTDeclReader::VisitClassTemplateSpecializationDeclImpl(
+RedeclarableResult ASTDeclReader::VisitClassTemplateSpecializationDeclImpl(
     ClassTemplateSpecializationDecl *D) {
   RedeclarableResult Redecl = VisitCXXRecordDeclImpl(D);
 
@@ -2515,13 +2521,13 @@ ASTDeclReader::VisitClassTemplateSpecializationDeclImpl(
       }
       // If there was already a canonical specialization, merge into it.
       if (CanonSpec != D) {
-        mergeRedeclarable<TagDecl>(D, CanonSpec, Redecl);
+        MergeImpl.mergeRedeclarable<TagDecl>(D, CanonSpec, Redecl);
 
         // This declaration might be a definition. Merge with any existing
         // definition.
         if (auto *DDD = D->DefinitionData) {
           if (CanonSpec->DefinitionData)
-            MergeDefinitionData(CanonSpec, std::move(*DDD));
+            MergeImpl.MergeDefinitionData(CanonSpec, std::move(*DDD));
           else
             CanonSpec->DefinitionData = D->DefinitionData;
         }
@@ -2577,8 +2583,7 @@ void ASTDeclReader::VisitFunctionTemplateDecl(FunctionTemplateDecl *D) {
 ///        VarTemplate(Partial)SpecializationDecl with a new data
 ///        structure Template(Partial)SpecializationDecl, and
 ///        using Template(Partial)SpecializationDecl as input type.
-ASTDeclReader::RedeclarableResult
-ASTDeclReader::VisitVarTemplateSpecializationDeclImpl(
+RedeclarableResult ASTDeclReader::VisitVarTemplateSpecializationDeclImpl(
     VarTemplateSpecializationDecl *D) {
   ASTContext &C = Reader.getContext();
   if (Decl *InstD = readDecl()) {
@@ -2633,7 +2638,7 @@ ASTDeclReader::VisitVarTemplateSpecializationDeclImpl(
       }
       // If we already have a matching specialization, merge it.
       if (CanonSpec != D)
-        mergeRedeclarable<VarDecl>(D, CanonSpec, Redecl);
+        MergeImpl.mergeRedeclarable<VarDecl>(D, CanonSpec, Redecl);
     }
   }
 
@@ -2762,8 +2767,7 @@ ASTDeclReader::VisitDeclContext(DeclContext *DC) {
 }
 
 template <typename T>
-ASTDeclReader::RedeclarableResult
-ASTDeclReader::VisitRedeclarable(Redeclarable<T> *D) {
+RedeclarableResult ASTDeclReader::VisitRedeclarable(Redeclarable<T> *D) {
   GlobalDeclID FirstDeclID = readDeclID();
   Decl *MergeWith = nullptr;
 
@@ -2838,10 +2842,10 @@ void ASTDeclReader::mergeRedeclarable(Redeclarable<T> *DBase,
 
   if (auto *Existing = Redecl.getKnownMergeTarget())
     // We already know of an existing declaration we should merge with.
-    mergeRedeclarable(D, cast<T>(Existing), Redecl);
+    MergeImpl.mergeRedeclarable(D, cast<T>(Existing), Redecl);
   else if (FindExistingResult ExistingRes = findExisting(D))
     if (T *Existing = ExistingRes)
-      mergeRedeclarable(D, Existing, Redecl);
+      MergeImpl.mergeRedeclarable(D, Existing, Redecl);
 }
 
 /// Attempt to merge D with a previous declaration of the same lambda, which is
@@ -2850,13 +2854,8 @@ void ASTDeclReader::mergeRedeclarable(Redeclarable<T> *DBase,
 /// We can't look up lambdas in their enclosing lexical or semantic context in
 /// general, because for lambdas in variables, both of those might be a
 /// namespace or the translation unit.
-void ASTDeclReader::mergeLambda(CXXRecordDecl *D, RedeclarableResult &Redecl,
-                                Decl *Context, unsigned IndexInContext) {
-  // If we don't have a mangling context, treat this like any other
-  // declaration.
-  if (!Context)
-    return mergeRedeclarable(D, Redecl);
-
+void ASTDeclMerger::mergeLambda(CXXRecordDecl *D, RedeclarableResult &Redecl,
+                                Decl &Context, unsigned IndexInContext) {
   // If modules are not available, there is no reason to perform this merge.
   if (!Reader.getContext().getLangOpts().Modules)
     return;
@@ -2872,7 +2871,7 @@ void ASTDeclReader::mergeLambda(CXXRecordDecl *D, RedeclarableResult &Redecl,
   // Look up this lambda to see if we've seen it before. If so, merge with the
   // one we already loaded.
   NamedDecl *&Slot = Reader.LambdaDeclarationsForMerging[{
-      Context->getCanonicalDecl(), IndexInContext}];
+      Context.getCanonicalDecl(), IndexInContext}];
   if (Slot)
     mergeRedeclarable(D, cast<TagDecl>(Slot), Redecl);
   else
@@ -2898,7 +2897,7 @@ template<typename T> static T assert_cast(...) {
 
 /// Merge together the pattern declarations from two template
 /// declarations.
-void ASTDeclReader::mergeTemplatePattern(RedeclarableTemplateDecl *D,
+void ASTDeclMerger::mergeTemplatePattern(RedeclarableTemplateDecl *D,
                                          RedeclarableTemplateDecl *Existing,
                                          bool IsKeyDecl) {
   auto *DPattern = D->getTemplatedDecl();
@@ -2941,8 +2940,8 @@ void ASTDeclReader::mergeTemplatePattern(RedeclarableTemplateDecl *D,
 /// Attempts to merge the given declaration (D) with another declaration
 /// of the same entity.
 template <typename T>
-void ASTDeclReader::mergeRedeclarable(Redeclarable<T> *DBase, T *Existing,
-                                      RedeclarableResult &Redecl) {
+void ASTDeclMerger::mergeRedeclarableImpl(Redeclarable<T> *DBase, T *Existing,
+                                          GlobalDeclID KeyDeclID) {
   auto *D = static_cast<T *>(DBase);
   T *ExistingCanon = Existing->getCanonicalDecl();
   T *DCanon = D->getCanonicalDecl();
@@ -2955,15 +2954,17 @@ void ASTDeclReader::mergeRedeclarable(Redeclarable<T> *DBase, T *Existing,
     ExistingCanon->Used |= D->Used;
     D->Used = false;
 
+    bool IsKeyDecl = KeyDeclID.isValid();
+
     // When we merge a template, merge its pattern.
     if (auto *DTemplate = dyn_cast<RedeclarableTemplateDecl>(D))
       mergeTemplatePattern(
           DTemplate, assert_cast<RedeclarableTemplateDecl *>(ExistingCanon),
-          Redecl.isKeyDecl());
+          IsKeyDecl);
 
     // If this declaration is a key declaration, make a note of that.
-    if (Redecl.isKeyDecl())
-      Reader.KeyDecls[ExistingCanon].push_back(Redecl.getFirstID());
+    if (IsKeyDecl)
+      Reader.KeyDecls[ExistingCanon].push_back(KeyDeclID);
   }
 }
 
@@ -3533,8 +3534,8 @@ Decl *ASTReader::getMostRecentExistingDecl(Decl *D) {
   return ASTDeclReader::getMostRecentDecl(D->getCanonicalDecl());
 }
 
-void ASTDeclReader::mergeInheritableAttributes(ASTReader &Reader, Decl *D,
-                                               Decl *Previous) {
+namespace {
+void mergeInheritableAttributes(ASTReader &Reader, Decl *D, Decl *Previous) {
   InheritableAttr *NewAttr = nullptr;
   ASTContext &Context = Reader.getContext();
   const auto *IA = Previous->getAttr<MSInheritanceAttr>();
@@ -3552,6 +3553,7 @@ void ASTDeclReader::mergeInheritableAttributes(ASTReader &Reader, Decl *D,
     D->addAttr(NewAttr);
   }
 }
+} // namespace
 
 template<typename DeclT>
 void ASTDeclReader::attachPreviousDeclImpl(ASTReader &Reader,
diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
index 0cab17a3424406..4d738e4bea41a6 100644
--- a/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
+++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
@@ -201,11 +201,8 @@ const CachedRealPath &DependencyScanningFilesystemSharedCache::CacheShard::
   return *StoredRealPath;
 }
 
-static bool shouldCacheStatFailures(StringRef Filename) {
-  StringRef Ext = llvm::sys::path::extension(Filename);
-  if (Ext.empty())
-    return false; // This may be the module cache directory.
-  return true;
+bool DependencyScanningWorkerFilesystem::shouldBypass(StringRef Path) const {
+  return BypassedPathPrefix && Path.starts_with(*BypassedPathPrefix);
 }
 
 DependencyScanningWorkerFilesystem::DependencyScanningWorkerFilesystem(
@@ -244,8 +241,6 @@ DependencyScanningWorkerFilesystem::computeAndStoreResult(
   llvm::ErrorOr<llvm::vfs::Status> Stat =
       getUnderlyingFS().status(OriginalFilename);
   if (!Stat) {
-    if (!shouldCacheStatFailures(OriginalFilename))
-      return Stat.getError();
     const auto &Entry =
         getOrEmplaceSharedEntryForFilename(FilenameForLookup, Stat.getError());
     return insertLocalEntryForFilename(FilenameForLookup, Entry);
@@ -291,7 +286,7 @@ DependencyScanningWorkerFilesystem::status(const Twine &Path) {
   SmallString<256> OwnedFilename;
   StringRef Filename = Path.toStringRef(OwnedFilename);
 
-  if (Filename.ends_with(".pcm"))
+  if (shouldBypass(Filename))
     return getUnderlyingFS().status(Path);
 
   llvm::ErrorOr<EntryRef> Result = getOrCreateFileSystemEntry(Filename);
@@ -362,7 +357,7 @@ DependencyScanningWorkerFilesystem::openFileForRead(const Twine &Path) {
   SmallString<256> OwnedFilename;
   StringRef Filename = Path.toStringRef(OwnedFilename);
 
-  if (Filename.ends_with(".pcm"))
+  if (shouldBypass(Filename))
     return getUnderlyingFS().openFileForRead(Path);
 
   llvm::ErrorOr<EntryRef> Result = getOrCreateFileSystemEntry(Filename);
@@ -377,6 +372,9 @@ DependencyScanningWorkerFilesystem::getRealPath(const Twine &Path,
   SmallString<256> OwnedFilename;
   StringRef OriginalFilename = Path.toStringRef(OwnedFilename);
 
+  if (shouldBypass(OriginalFilename))
+    return getUnderlyingFS().getRealPath(Path, Output);
+
   SmallString<256> PathBuf;
   auto FilenameForLookup = tryGetFilenameForLookup(OriginalFilename, PathBuf);
   if (!FilenameForLookup)
diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
index 91842627b001c2..1a21a4f5e30ff8 100644
--- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
+++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp
@@ -345,6 +345,26 @@ class DependencyScanningAction : public tooling::ToolAction {
         ScanInstance.getInvocation(), ScanInstance.getDiagnostics(),
         DriverFileMgr->getVirtualFileSystemPtr());
 
+    // Use the dependency scanning optimized file system if requested to do so.
+    if (DepFS) {
+      StringRef ModulesCachePath =
+          ScanInstance.getHeaderSearchOpts().ModuleCachePath;
+
+      DepFS->resetBypassedPathPrefix();
+      if (!ModulesCachePath.empty())
+        DepFS->setBypassedPathPrefix(ModulesCachePath);
+
+      ScanInstance.getPreprocessorOpts().DependencyDirectivesForFile =
+          [LocalDepFS = DepFS](FileEntryRef File)
+          -> std::optional<ArrayRef<dependency_directives_scan::Directive>> {
+        if (llvm::ErrorOr<EntryRef> Entry =
+                LocalDepFS->getOrCreateFileSystemEntry(File.getName()))
+          if (LocalDepFS->ensureDirectiveTokensArePopulated(*Entry))
+            return Entry->getDirectiveTokens();
+        return std::nullopt;
+      };
+    }
+
     // Create a new FileManager to match the invocation's FileSystemOptions.
     auto *FileMgr = ScanInstance.createFileManager(FS);
     ScanInstance.createSourceManager(*FileMgr);
@@ -361,18 +381,6 @@ class DependencyScanningAction : public tooling::ToolAction {
               PrebuiltModuleVFSMap, ScanInstance.getDiagnostics()))
         return false;
 
-    // Use the dependency scanning optimized file system if requested to do so.
-    if (DepFS)
-      ScanInstance.getPreprocessorOpts().DependencyDirectivesForFile =
-          [LocalDepFS = DepFS](FileEntryRef File)
-          -> std::optional<ArrayRef<dependency_directives_scan::Directive>> {
-        if (llvm::ErrorOr<EntryRef> Entry =
-                LocalDepFS->getOrCreateFileSystemEntry(File.getName()))
-          if (LocalDepFS->ensureDirectiveTokensArePopulated(*Entry))
-            return Entry->getDirectiveTokens();
-        return std::nullopt;
-      };
-
     // Create the dependency collector that will collect the produced
     // dependencies.
     //
diff --git a/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes
index b0eead42869a41..f4f9c7a244e0a3 100644
--- a/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes
+++ b/clang/test/APINotes/Inputs/Headers/SwiftImportAs.apinotes
@@ -7,7 +7,9 @@ Tags:
   SwiftImportAs: reference
   SwiftReleaseOp: RCRelease
   SwiftRetainOp: RCRetain
+  SwiftConformsTo: MySwiftModule.MySwiftRefCountedProtocol
 - Name: NonCopyableType
   SwiftCopyable: false
+  SwiftConformsTo: MySwiftModule.MySwiftNonCopyableProtocol
 - Name: CopyableType
   SwiftCopyable: true
diff --git a/clang/test/APINotes/swift-import-as.cpp b/clang/test/APINotes/swift-import-as.cpp
index 62e6450e94e113..6457e1557618de 100644
--- a/clang/test/APINotes/swift-import-as.cpp
+++ b/clang/test/APINotes/swift-import-as.cpp
@@ -16,9 +16,11 @@
 // CHECK-REF-COUNTED: SwiftAttrAttr {{.+}} <<invalid sloc>> "import_reference"
 // CHECK-REF-COUNTED: SwiftAttrAttr {{.+}} <<invalid sloc>> "retain:RCRetain"
 // CHECK-REF-COUNTED: SwiftAttrAttr {{.+}} <<invalid sloc>> "release:RCRelease"
+// CHECK-REF-COUNTED: SwiftAttrAttr {{.+}} <<invalid sloc>> "conforms_to:MySwiftModule.MySwiftRefCountedProtocol"
 
 // CHECK-NON-COPYABLE: Dumping NonCopyableType:
 // CHECK-NON-COPYABLE-NEXT: CXXRecordDecl {{.+}} imported in SwiftImportAs {{.+}} struct NonCopyableType
+// CHECK-NON-COPYABLE: SwiftAttrAttr {{.+}} <<invalid sloc>> "conforms_to:MySwiftModule.MySwiftNonCopyableProtocol"
 // CHECK-NON-COPYABLE: SwiftAttrAttr {{.+}} <<invalid sloc>> "~Copyable"
 
 // CHECK-COPYABLE: Dumping CopyableType:
diff --git a/clang/test/AST/Interp/constexpr-frame-describe.cpp b/clang/test/AST/Interp/constexpr-frame-describe.cpp
index e039fd61ae9812..a0ae046fc01786 100644
--- a/clang/test/AST/Interp/constexpr-frame-describe.cpp
+++ b/clang/test/AST/Interp/constexpr-frame-describe.cpp
@@ -81,3 +81,18 @@ static_assert(bar.fail2<int*, 42>()); // both-error {{constant expression}} \
 static_assert(bar.fail3(3, 4UL, bar, &bar)); // both-error {{constant expression}} \
                                              // expected-note {{in call to 'bar.fail3<int, unsigned long, Bar<int>, const Bar<int> *>(3, 4, &bar, &bar)'}} \
                                              // ref-note {{in call to 'bar.fail3<int, unsigned long, Bar<int>, const Bar<int> *>(3, 4, {}, &bar)'}}
+
+
+
+/// FIXME: Bound member pointer printing doesn't work right, see the last parameter to MemPtr().
+struct MemPtrTest {
+  int n;
+  void f();
+};
+MemPtrTest mpt; // both-note {{here}}
+constexpr int MemPtr(int (MemPtrTest::*a), void (MemPtrTest::*b)(), int &c) {
+  return c; // both-note {{read of non-constexpr variable 'mpt'}}
+}
+static_assert(MemPtr(&MemPtrTest::n, &MemPtrTest::f, mpt.*&MemPtrTest::n), ""); // both-error {{constant expression}} \
+                                                                                // expected-note {{in call to 'MemPtr(&MemPtrTest::n, &MemPtrTest::f, mpt)'}} \
+                                                                                // ref-note {{in call to 'MemPtr(&MemPtrTest::n, &MemPtrTest::f, mpt.n)'}}
diff --git a/clang/test/AST/Interp/cxx20.cpp b/clang/test/AST/Interp/cxx20.cpp
index 389d9d883725f4..77a967d42c4efe 100644
--- a/clang/test/AST/Interp/cxx20.cpp
+++ b/clang/test/AST/Interp/cxx20.cpp
@@ -859,7 +859,6 @@ namespace DefinitionLoc {
                                           // both-note {{non-constexpr constructor}}
 }
 
-/// FIXME: Call base dtors when explicitly calling dtor.
 namespace VirtDtor {
   class B {
   public:
@@ -900,5 +899,5 @@ namespace VirtDtor {
     return buff[0] == a && buff[1] == b;
   }
 
-  static_assert(test('C', 'B')); // expected-error {{failed}}
+  static_assert(test('C', 'B'));
 }
diff --git a/clang/test/AST/Interp/new-delete.cpp b/clang/test/AST/Interp/new-delete.cpp
index 325ce27c6d51da..6bb30bc19f110c 100644
--- a/clang/test/AST/Interp/new-delete.cpp
+++ b/clang/test/AST/Interp/new-delete.cpp
@@ -514,8 +514,7 @@ namespace DeleteRunsDtors {
   static_assert(abc2() == 1);
 }
 
-/// FIXME: There is a slight difference in diagnostics here, because we don't
-/// create a new frame when we delete record fields or bases at all.
+/// FIXME: There is a slight difference in diagnostics here.
 namespace FaultyDtorCalledByDelete {
   struct InnerFoo {
     int *mem;
@@ -536,7 +535,7 @@ namespace FaultyDtorCalledByDelete {
       a = new int(13);
       IF.mem = new int(100);
     }
-    constexpr ~Foo() { delete a; }
+    constexpr ~Foo() { delete a; } // expected-note {{in call to}}
   };
 
   constexpr int abc() {
diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp
index e620bf9e0e041e..f51f9771b38aaa 100644
--- a/clang/test/AST/Interp/records.cpp
+++ b/clang/test/AST/Interp/records.cpp
@@ -1595,4 +1595,35 @@ namespace VirtDtor {
   }
   static_assert(virt_dtor(0, "ZYX"));
 }
+
+namespace DtorDestroysFieldsAfterSelf {
+    struct  S {
+      int a = 10;
+      constexpr ~S() {
+        a = 0;
+      }
+
+    };
+    struct F {
+      S s;
+      int a;
+      int &b;
+      constexpr F(int a, int &b) : a(a), b(b) {}
+      constexpr ~F() {
+        b += s.a;
+      }
+    };
+
+  constexpr int foo() {
+    int a = 10;
+    int b = 5;
+    {
+      F f(a, b);
+    }
+
+    return b;
+  }
+
+  static_assert(foo() == 15);
+}
 #endif
diff --git a/clang/test/AST/atomic-expr.cpp b/clang/test/AST/atomic-expr.cpp
index bdb7bcb00569a1..d9d632ffc5917b 100644
--- a/clang/test/AST/atomic-expr.cpp
+++ b/clang/test/AST/atomic-expr.cpp
@@ -25,14 +25,14 @@ void useage(){
 }
 
 // CHECK:FunctionTemplateDecl 0x{{[0-9a-f]+}} <{{[^,]+}}, line:{{.*}}:1> line:{{.*}}:6 pr43370
-// CHECK: AtomicExpr
+// CHECK: AtomicExpr 0x{{[0-9a-f]+}} <{{.*}}> 'void' __atomic_store_n
 // CHECK-NEXT: ImplicitCastExpr
 // CHECK-SAME: <ArrayToPointerDecay>
 // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-f]+}} <{{[^:]+}}:20> 'int[2]' lvalue Var 0x{{[0-9a-f]+}} 'arr' 'int[2]'
 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-f]+}} <{{[^:]+}}:28> 'int' 5
 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-f]+}} <{{[^:]+}}:25> 'int' 0
 // CHECK:FunctionDecl 0x{{[0-9a-f]+}} <line:{{.*}}:1, line:{{.*}}:1> line:{{.*}}:6 used pr43370
-// CHECK: AtomicExpr
+// CHECK: AtomicExpr 0x{{[0-9a-f]+}} <{{.*}}> 'void' __atomic_store_n
 // CHECK-NEXT: ImplicitCastExpr
 // CHECK-SAME: <ArrayToPointerDecay>
 // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-f]+}} <{{[^:]+}}:20> 'int[2]' lvalue Var 0x{{[0-9a-f]+}} 'arr' 'int[2]'
@@ -40,7 +40,7 @@ void useage(){
 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-f]+}} <{{[^:]+}}:25> 'int' 0
 
 // CHECK:FunctionTemplateDecl 0x{{[0-9a-f]+}} <line:{{.*}}:1, line:{{.*}}:1> line:{{.*}}:6 foo
-// CHECK: AtomicExpr
+// CHECK: AtomicExpr 0x{{[0-9a-f]+}} <{{.*}}> 'bool' __atomic_compare_exchange_n
 // CHECK-NEXT: ImplicitCastExpr
 // CHECK-SAME: <ArrayToPointerDecay>
 // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-f]+}} <{{[^:]+}}:37> 'int[2]' lvalue Var 0x{{[0-9a-f]+}} 'arr' 'int[2]'
@@ -53,7 +53,7 @@ void useage(){
 // CHECK-NEXT: ImplicitCastExpr
 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-f]+}} <{{[^:]+}}:50> 'int' 0
 // CHECK:FunctionDecl 0x{{[0-9a-f]+}} <line:{{.*}}:1, line:{{.*}}:1> line:{{.*}}:6 used foo
-// CHECK: AtomicExpr
+// CHECK: AtomicExpr 0x{{[0-9a-f]+}} <{{.*}}> 'bool' __atomic_compare_exchange_n
 // CHECK-NEXT: ImplicitCastExpr
 // CHECK-SAME: <ArrayToPointerDecay>
 // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-f]+}} <{{[^:]+}}:37> 'int[2]' lvalue Var 0x{{[0-9a-f]+}} 'arr' 'int[2]'
diff --git a/clang/test/CodeGenCUDA/Inputs/cuda.h b/clang/test/CodeGenCUDA/Inputs/cuda.h
index 8df425b77ac823..dc760500e65d41 100644
--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@@ -46,6 +46,11 @@ extern "C" hipError_t hipLaunchKernel_spt(const void *func, dim3 gridDim,
                                       size_t sharedMem,
                                       hipStream_t stream);
 #endif // __HIP_API_PER_THREAD_DEFAULT_STREAM__
+#elif __OFFLOAD_VIA_LLVM__
+extern "C" unsigned __llvmPushCallConfiguration(dim3 gridDim, dim3 blockDim,
+                                     size_t sharedMem = 0, void *stream = 0);
+extern "C" unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+                          void **args, size_t sharedMem = 0, void *stream = 0);
 #else
 typedef struct cudaStream *cudaStream_t;
 typedef enum cudaError {} cudaError_t;
diff --git a/clang/test/CodeGenCUDA/offload_via_llvm.cu b/clang/test/CodeGenCUDA/offload_via_llvm.cu
new file mode 100644
index 00000000000000..434eba99c1795d
--- /dev/null
+++ b/clang/test/CodeGenCUDA/offload_via_llvm.cu
@@ -0,0 +1,90 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -foffload-via-llvm -emit-llvm -o - | FileCheck %s --check-prefix=HST
+// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device -foffload-via-llvm -emit-llvm -o - | FileCheck %s --check-prefix=DEV
+
+// Check that we generate LLVM/Offload calls, including the KERNEL_LAUNCH_PARAMS argument.
+
+#define __OFFLOAD_VIA_LLVM__ 1
+#include "Inputs/cuda.h"
+
+// HST-LABEL: define dso_local void @_Z18__device_stub__fooisPvS_(
+// HST-SAME: i32 noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
+// HST-NEXT:  [[ENTRY:.*:]]
+// HST-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
+// HST-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
+// HST-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 4
+// HST-NEXT:    [[DOTADDR3:%.*]] = alloca ptr, align 4
+// HST-NEXT:    [[KERNEL_ARGS:%.*]] = alloca [[TMP0]], align 16
+// HST-NEXT:    [[KERNEL_LAUNCH_PARAMS:%.*]] = alloca [[TMP1]], align 16
+// HST-NEXT:    [[GRID_DIM:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 8
+// HST-NEXT:    [[BLOCK_DIM:%.*]] = alloca [[STRUCT_DIM3]], align 8
+// HST-NEXT:    [[SHMEM_SIZE:%.*]] = alloca i32, align 4
+// HST-NEXT:    [[STREAM:%.*]] = alloca ptr, align 4
+// HST-NEXT:    store i32 [[TMP0]], ptr [[DOTADDR]], align 4
+// HST-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
+// HST-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
+// HST-NEXT:    store ptr [[TMP3]], ptr [[DOTADDR3]], align 4
+// HST-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 0
+// HST-NEXT:    store i64 16, ptr [[TMP4]], align 16
+// HST-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 1
+// HST-NEXT:    store ptr [[KERNEL_ARGS]], ptr [[TMP5]], align 8
+// HST-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw [[TMP1]], ptr [[KERNEL_LAUNCH_PARAMS]], i32 0, i32 2
+// HST-NEXT:    store ptr null, ptr [[TMP6]], align 4
+// HST-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTADDR]], align 4
+// HST-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 0
+// HST-NEXT:    store i32 [[TMP7]], ptr [[TMP8]], align 16
+// HST-NEXT:    [[TMP9:%.*]] = load i16, ptr [[DOTADDR1]], align 2
+// HST-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 1
+// HST-NEXT:    store i16 [[TMP9]], ptr [[TMP10]], align 4
+// HST-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
+// HST-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 2
+// HST-NEXT:    store ptr [[TMP11]], ptr [[TMP12]], align 8
+// HST-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[DOTADDR3]], align 4
+// HST-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw [[TMP0]], ptr [[KERNEL_ARGS]], i32 0, i32 3
+// HST-NEXT:    store ptr [[TMP13]], ptr [[TMP14]], align 4
+// HST-NEXT:    [[TMP15:%.*]] = call i32 @__llvmPopCallConfiguration(ptr [[GRID_DIM]], ptr [[BLOCK_DIM]], ptr [[SHMEM_SIZE]], ptr [[STREAM]])
+// HST-NEXT:    [[TMP16:%.*]] = load i32, ptr [[SHMEM_SIZE]], align 4
+// HST-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[STREAM]], align 4
+// HST-NEXT:    [[CALL:%.*]] = call noundef i32 @llvmLaunchKernel(ptr noundef @_Z18__device_stub__fooisPvS_, ptr noundef byval([[STRUCT_DIM3]]) align 4 [[GRID_DIM]], ptr noundef byval([[STRUCT_DIM3]]) align 4 [[BLOCK_DIM]], ptr noundef [[KERNEL_LAUNCH_PARAMS]], i32 noundef [[TMP16]], ptr noundef [[TMP17]])
+// HST-NEXT:    br label %[[SETUP_END:.*]]
+// HST:       [[SETUP_END]]:
+// HST-NEXT:    ret void
+//
+// DEV-LABEL: define dso_local void @_Z3fooisPvS_(
+// DEV-SAME: i32 noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], ptr noundef [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] {
+// DEV-NEXT:  [[ENTRY:.*:]]
+// DEV-NEXT:    [[DOTADDR:%.*]] = alloca i32, align 4
+// DEV-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
+// DEV-NEXT:    [[DOTADDR2:%.*]] = alloca ptr, align 4
+// DEV-NEXT:    [[DOTADDR3:%.*]] = alloca ptr, align 4
+// DEV-NEXT:    store i32 [[TMP0]], ptr [[DOTADDR]], align 4
+// DEV-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
+// DEV-NEXT:    store ptr [[TMP2]], ptr [[DOTADDR2]], align 4
+// DEV-NEXT:    store ptr [[TMP3]], ptr [[DOTADDR3]], align 4
+// DEV-NEXT:    ret void
+//
+__global__ void foo(int, short, void *, void *) {}
+
+// HST-LABEL: define dso_local void @_Z5test1Pv(
+// HST-SAME: ptr noundef [[PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+// HST-NEXT:  [[ENTRY:.*:]]
+// HST-NEXT:    [[PTR_ADDR:%.*]] = alloca ptr, align 4
+// HST-NEXT:    [[AGG_TMP:%.*]] = alloca [[STRUCT_DIM3:%.*]], align 4
+// HST-NEXT:    [[AGG_TMP1:%.*]] = alloca [[STRUCT_DIM3]], align 4
+// HST-NEXT:    store ptr [[PTR]], ptr [[PTR_ADDR]], align 4
+// HST-NEXT:    call void @_ZN4dim3C1Ejjj(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP]], i32 noundef 3, i32 noundef 1, i32 noundef 1)
+// HST-NEXT:    call void @_ZN4dim3C1Ejjj(ptr noundef nonnull align 4 dereferenceable(12) [[AGG_TMP1]], i32 noundef 7, i32 noundef 1, i32 noundef 1)
+// HST-NEXT:    [[CALL:%.*]] = call i32 @__llvmPushCallConfiguration(ptr noundef byval([[STRUCT_DIM3]]) align 4 [[AGG_TMP]], ptr noundef byval([[STRUCT_DIM3]]) align 4 [[AGG_TMP1]], i32 noundef 0, ptr noundef null)
+// HST-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[CALL]], 0
+// HST-NEXT:    br i1 [[TOBOOL]], label %[[KCALL_END:.*]], label %[[KCALL_CONFIGOK:.*]]
+// HST:       [[KCALL_CONFIGOK]]:
+// HST-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
+// HST-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[PTR_ADDR]], align 4
+// HST-NEXT:    call void @_Z18__device_stub__fooisPvS_(i32 noundef 13, i16 noundef signext 1, ptr noundef [[TMP0]], ptr noundef [[TMP1]]) #[[ATTR3:[0-9]+]]
+// HST-NEXT:    br label %[[KCALL_END]]
+// HST:       [[KCALL_END]]:
+// HST-NEXT:    ret void
+//
+void test1(void *Ptr) {
+  foo<<<3, 7>>>(13, 1, Ptr, Ptr);
+}
diff --git a/clang/test/CodeGenCXX/x86_64-vaarg.cpp b/clang/test/CodeGenCXX/x86_64-vaarg.cpp
index 439a66deb4fde7..c7ae88efb392e2 100644
--- a/clang/test/CodeGenCXX/x86_64-vaarg.cpp
+++ b/clang/test/CodeGenCXX/x86_64-vaarg.cpp
@@ -29,6 +29,7 @@ typedef struct {
 // CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_S1:%.*]], align 8
 // CHECK-NEXT:    [[Z_ADDR:%.*]] = alloca i32, align 4
 // CHECK-NEXT:    [[LIST:%.*]] = alloca [1 x %struct.__va_list_tag], align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_S1]], align 8
 // CHECK-NEXT:    store i32 [[Z:%.*]], ptr [[Z_ADDR]], align 4
 // CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
 // CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[ARRAYDECAY]])
@@ -41,8 +42,11 @@ typedef struct {
 // CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT___VA_LIST_TAG]], ptr [[ARRAYDECAY1]], i32 0, i32 3
 // CHECK-NEXT:    [[REG_SAVE_AREA:%.*]] = load ptr, ptr [[TMP0]], align 16
 // CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[REG_SAVE_AREA]], i32 [[FP_OFFSET]]
-// CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[FP_OFFSET]], 16
-// CHECK-NEXT:    store i32 [[TMP2]], ptr [[FP_OFFSET_P]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP]], i32 8
+// CHECK-NEXT:    store double [[TMP2]], ptr [[TMP3]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[FP_OFFSET]], 16
+// CHECK-NEXT:    store i32 [[TMP4]], ptr [[FP_OFFSET_P]], align 4
 // CHECK-NEXT:    br label [[VAARG_END:%.*]]
 // CHECK:       vaarg.in_mem:
 // CHECK-NEXT:    [[OVERFLOW_ARG_AREA_P:%.*]] = getelementptr inbounds nuw [[STRUCT___VA_LIST_TAG]], ptr [[ARRAYDECAY1]], i32 0, i32 2
@@ -51,14 +55,257 @@ typedef struct {
 // CHECK-NEXT:    store ptr [[OVERFLOW_ARG_AREA_NEXT]], ptr [[OVERFLOW_ARG_AREA_P]], align 8
 // CHECK-NEXT:    br label [[VAARG_END]]
 // CHECK:       vaarg.end:
-// CHECK-NEXT:    [[VAARG_ADDR:%.*]] = phi ptr [ [[TMP1]], [[VAARG_IN_REG]] ], [ [[OVERFLOW_ARG_AREA]], [[VAARG_IN_MEM]] ]
+// CHECK-NEXT:    [[VAARG_ADDR:%.*]] = phi ptr [ [[TMP]], [[VAARG_IN_REG]] ], [ [[OVERFLOW_ARG_AREA]], [[VAARG_IN_MEM]] ]
 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[VAARG_ADDR]], i64 16, i1 false)
-// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[RETVAL]], i64 8
-// CHECK-NEXT:    [[TMP4:%.*]] = load double, ptr [[TMP3]], align 8
-// CHECK-NEXT:    ret double [[TMP4]]
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT:    [[TMP6:%.*]] = load double, ptr [[TMP5]], align 8
+// CHECK-NEXT:    ret double [[TMP6]]
 //
 s1 f(int z, ...) {
   __builtin_va_list list;
   __builtin_va_start(list, z);
   return __builtin_va_arg(list, s1);
 }
+
+typedef struct {
+  struct{} a[5];
+  float b;
+  float c;
+} s2;
+
+// CHECK-LABEL: @_Z2f2iz(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_S2:%.*]], align 4
+// CHECK-NEXT:    [[Z_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[LIST:%.*]] = alloca [1 x %struct.__va_list_tag], align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_S2]], align 4
+// CHECK-NEXT:    store i32 [[Z:%.*]], ptr [[Z_ADDR]], align 4
+// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[ARRAYDECAY]])
+// CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
+// CHECK-NEXT:    [[FP_OFFSET_P:%.*]] = getelementptr inbounds nuw [[STRUCT___VA_LIST_TAG:%.*]], ptr [[ARRAYDECAY1]], i32 0, i32 1
+// CHECK-NEXT:    [[FP_OFFSET:%.*]] = load i32, ptr [[FP_OFFSET_P]], align 4
+// CHECK-NEXT:    [[FITS_IN_FP:%.*]] = icmp ule i32 [[FP_OFFSET]], 160
+// CHECK-NEXT:    br i1 [[FITS_IN_FP]], label [[VAARG_IN_REG:%.*]], label [[VAARG_IN_MEM:%.*]]
+// CHECK:       vaarg.in_reg:
+// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT___VA_LIST_TAG]], ptr [[ARRAYDECAY1]], i32 0, i32 3
+// CHECK-NEXT:    [[REG_SAVE_AREA:%.*]] = load ptr, ptr [[TMP0]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[REG_SAVE_AREA]], i32 [[FP_OFFSET]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, ptr [[TMP1]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP]], i32 8
+// CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[TMP3]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[FP_OFFSET]], 16
+// CHECK-NEXT:    store i32 [[TMP4]], ptr [[FP_OFFSET_P]], align 4
+// CHECK-NEXT:    br label [[VAARG_END:%.*]]
+// CHECK:       vaarg.in_mem:
+// CHECK-NEXT:    [[OVERFLOW_ARG_AREA_P:%.*]] = getelementptr inbounds nuw [[STRUCT___VA_LIST_TAG]], ptr [[ARRAYDECAY1]], i32 0, i32 2
+// CHECK-NEXT:    [[OVERFLOW_ARG_AREA:%.*]] = load ptr, ptr [[OVERFLOW_ARG_AREA_P]], align 8
+// CHECK-NEXT:    [[OVERFLOW_ARG_AREA_NEXT:%.*]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i32 16
+// CHECK-NEXT:    store ptr [[OVERFLOW_ARG_AREA_NEXT]], ptr [[OVERFLOW_ARG_AREA_P]], align 8
+// CHECK-NEXT:    br label [[VAARG_END]]
+// CHECK:       vaarg.end:
+// CHECK-NEXT:    [[VAARG_ADDR:%.*]] = phi ptr [ [[TMP]], [[VAARG_IN_REG]] ], [ [[OVERFLOW_ARG_AREA]], [[VAARG_IN_MEM]] ]
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RETVAL]], ptr align 4 [[VAARG_ADDR]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT:    [[TMP6:%.*]] = load <2 x float>, ptr [[TMP5]], align 4
+// CHECK-NEXT:    ret <2 x float> [[TMP6]]
+//
+s2 f2(int z, ...) {
+  __builtin_va_list list;
+  __builtin_va_start(list, z);
+  return __builtin_va_arg(list, s2);
+}
+
+typedef struct {
+  struct{} a;
+  long long b;
+} s3;
+
+// CHECK-LABEL: @_Z2f3iz(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_S3:%.*]], align 8
+// CHECK-NEXT:    [[Z_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[LIST:%.*]] = alloca [1 x %struct.__va_list_tag], align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_S3]], align 8
+// CHECK-NEXT:    store i32 [[Z:%.*]], ptr [[Z_ADDR]], align 4
+// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[ARRAYDECAY]])
+// CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
+// CHECK-NEXT:    [[GP_OFFSET_P:%.*]] = getelementptr inbounds nuw [[STRUCT___VA_LIST_TAG:%.*]], ptr [[ARRAYDECAY1]], i32 0, i32 0
+// CHECK-NEXT:    [[GP_OFFSET:%.*]] = load i32, ptr [[GP_OFFSET_P]], align 16
+// CHECK-NEXT:    [[FITS_IN_GP:%.*]] = icmp ule i32 [[GP_OFFSET]], 40
+// CHECK-NEXT:    br i1 [[FITS_IN_GP]], label [[VAARG_IN_REG:%.*]], label [[VAARG_IN_MEM:%.*]]
+// CHECK:       vaarg.in_reg:
+// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT___VA_LIST_TAG]], ptr [[ARRAYDECAY1]], i32 0, i32 3
+// CHECK-NEXT:    [[REG_SAVE_AREA:%.*]] = load ptr, ptr [[TMP0]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[REG_SAVE_AREA]], i32 [[GP_OFFSET]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP]], i32 8
+// CHECK-NEXT:    store i64 [[TMP2]], ptr [[TMP3]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[GP_OFFSET]], 8
+// CHECK-NEXT:    store i32 [[TMP4]], ptr [[GP_OFFSET_P]], align 16
+// CHECK-NEXT:    br label [[VAARG_END:%.*]]
+// CHECK:       vaarg.in_mem:
+// CHECK-NEXT:    [[OVERFLOW_ARG_AREA_P:%.*]] = getelementptr inbounds nuw [[STRUCT___VA_LIST_TAG]], ptr [[ARRAYDECAY1]], i32 0, i32 2
+// CHECK-NEXT:    [[OVERFLOW_ARG_AREA:%.*]] = load ptr, ptr [[OVERFLOW_ARG_AREA_P]], align 8
+// CHECK-NEXT:    [[OVERFLOW_ARG_AREA_NEXT:%.*]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i32 16
+// CHECK-NEXT:    store ptr [[OVERFLOW_ARG_AREA_NEXT]], ptr [[OVERFLOW_ARG_AREA_P]], align 8
+// CHECK-NEXT:    br label [[VAARG_END]]
+// CHECK:       vaarg.end:
+// CHECK-NEXT:    [[VAARG_ADDR:%.*]] = phi ptr [ [[TMP]], [[VAARG_IN_REG]] ], [ [[OVERFLOW_ARG_AREA]], [[VAARG_IN_MEM]] ]
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[VAARG_ADDR]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+s3 f3(int z, ...) {
+  __builtin_va_list list;
+  __builtin_va_start(list, z);
+  return __builtin_va_arg(list, s3);
+}
+
+typedef struct {
+  struct{} a[7];
+  short b;
+  int c;
+} s4;
+
+// CHECK-LABEL: @_Z2f4iz(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_S4:%.*]], align 4
+// CHECK-NEXT:    [[Z_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[LIST:%.*]] = alloca [1 x %struct.__va_list_tag], align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_S4]], align 4
+// CHECK-NEXT:    store i32 [[Z:%.*]], ptr [[Z_ADDR]], align 4
+// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[ARRAYDECAY]])
+// CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
+// CHECK-NEXT:    [[GP_OFFSET_P:%.*]] = getelementptr inbounds nuw [[STRUCT___VA_LIST_TAG:%.*]], ptr [[ARRAYDECAY1]], i32 0, i32 0
+// CHECK-NEXT:    [[GP_OFFSET:%.*]] = load i32, ptr [[GP_OFFSET_P]], align 16
+// CHECK-NEXT:    [[FITS_IN_GP:%.*]] = icmp ule i32 [[GP_OFFSET]], 40
+// CHECK-NEXT:    br i1 [[FITS_IN_GP]], label [[VAARG_IN_REG:%.*]], label [[VAARG_IN_MEM:%.*]]
+// CHECK:       vaarg.in_reg:
+// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT___VA_LIST_TAG]], ptr [[ARRAYDECAY1]], i32 0, i32 3
+// CHECK-NEXT:    [[REG_SAVE_AREA:%.*]] = load ptr, ptr [[TMP0]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[REG_SAVE_AREA]], i32 [[GP_OFFSET]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP]], i32 8
+// CHECK-NEXT:    store i64 [[TMP2]], ptr [[TMP3]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[GP_OFFSET]], 8
+// CHECK-NEXT:    store i32 [[TMP4]], ptr [[GP_OFFSET_P]], align 16
+// CHECK-NEXT:    br label [[VAARG_END:%.*]]
+// CHECK:       vaarg.in_mem:
+// CHECK-NEXT:    [[OVERFLOW_ARG_AREA_P:%.*]] = getelementptr inbounds nuw [[STRUCT___VA_LIST_TAG]], ptr [[ARRAYDECAY1]], i32 0, i32 2
+// CHECK-NEXT:    [[OVERFLOW_ARG_AREA:%.*]] = load ptr, ptr [[OVERFLOW_ARG_AREA_P]], align 8
+// CHECK-NEXT:    [[OVERFLOW_ARG_AREA_NEXT:%.*]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i32 16
+// CHECK-NEXT:    store ptr [[OVERFLOW_ARG_AREA_NEXT]], ptr [[OVERFLOW_ARG_AREA_P]], align 8
+// CHECK-NEXT:    br label [[VAARG_END]]
+// CHECK:       vaarg.end:
+// CHECK-NEXT:    [[VAARG_ADDR:%.*]] = phi ptr [ [[TMP]], [[VAARG_IN_REG]] ], [ [[OVERFLOW_ARG_AREA]], [[VAARG_IN_MEM]] ]
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RETVAL]], ptr align 4 [[VAARG_ADDR]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 4
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+s4 f4(int z, ...) {
+  __builtin_va_list list;
+  __builtin_va_start(list, z);
+  return __builtin_va_arg(list, s4);
+}
+
+typedef struct {
+  struct{} a[5];
+  float b;
+  int c;
+} s5;
+
+// CHECK-LABEL: @_Z2f5iz(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_S5:%.*]], align 4
+// CHECK-NEXT:    [[Z_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[LIST:%.*]] = alloca [1 x %struct.__va_list_tag], align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_S5]], align 4
+// CHECK-NEXT:    store i32 [[Z:%.*]], ptr [[Z_ADDR]], align 4
+// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[ARRAYDECAY]])
+// CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
+// CHECK-NEXT:    [[GP_OFFSET_P:%.*]] = getelementptr inbounds nuw [[STRUCT___VA_LIST_TAG:%.*]], ptr [[ARRAYDECAY1]], i32 0, i32 0
+// CHECK-NEXT:    [[GP_OFFSET:%.*]] = load i32, ptr [[GP_OFFSET_P]], align 16
+// CHECK-NEXT:    [[FITS_IN_GP:%.*]] = icmp ule i32 [[GP_OFFSET]], 40
+// CHECK-NEXT:    br i1 [[FITS_IN_GP]], label [[VAARG_IN_REG:%.*]], label [[VAARG_IN_MEM:%.*]]
+// CHECK:       vaarg.in_reg:
+// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT___VA_LIST_TAG]], ptr [[ARRAYDECAY1]], i32 0, i32 3
+// CHECK-NEXT:    [[REG_SAVE_AREA:%.*]] = load ptr, ptr [[TMP0]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[REG_SAVE_AREA]], i32 [[GP_OFFSET]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP]], i32 8
+// CHECK-NEXT:    store i64 [[TMP2]], ptr [[TMP3]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[GP_OFFSET]], 8
+// CHECK-NEXT:    store i32 [[TMP4]], ptr [[GP_OFFSET_P]], align 16
+// CHECK-NEXT:    br label [[VAARG_END:%.*]]
+// CHECK:       vaarg.in_mem:
+// CHECK-NEXT:    [[OVERFLOW_ARG_AREA_P:%.*]] = getelementptr inbounds nuw [[STRUCT___VA_LIST_TAG]], ptr [[ARRAYDECAY1]], i32 0, i32 2
+// CHECK-NEXT:    [[OVERFLOW_ARG_AREA:%.*]] = load ptr, ptr [[OVERFLOW_ARG_AREA_P]], align 8
+// CHECK-NEXT:    [[OVERFLOW_ARG_AREA_NEXT:%.*]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i32 16
+// CHECK-NEXT:    store ptr [[OVERFLOW_ARG_AREA_NEXT]], ptr [[OVERFLOW_ARG_AREA_P]], align 8
+// CHECK-NEXT:    br label [[VAARG_END]]
+// CHECK:       vaarg.end:
+// CHECK-NEXT:    [[VAARG_ADDR:%.*]] = phi ptr [ [[TMP]], [[VAARG_IN_REG]] ], [ [[OVERFLOW_ARG_AREA]], [[VAARG_IN_MEM]] ]
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RETVAL]], ptr align 4 [[VAARG_ADDR]], i64 16, i1 false)
+// CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[RETVAL]], i64 8
+// CHECK-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 4
+// CHECK-NEXT:    ret i64 [[TMP6]]
+//
+s5 f5(int z, ...) {
+  __builtin_va_list list;
+  __builtin_va_start(list, z);
+  return __builtin_va_arg(list, s5);
+}
+
+typedef struct {
+  long long a;
+  struct{} b;
+} s6;
+
+// CHECK-LABEL: @_Z2f6iz(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_S6:%.*]], align 8
+// CHECK-NEXT:    [[Z_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[LIST:%.*]] = alloca [1 x %struct.__va_list_tag], align 16
+// CHECK-NEXT:    [[TMP:%.*]] = alloca [[STRUCT_S6]], align 8
+// CHECK-NEXT:    store i32 [[Z:%.*]], ptr [[Z_ADDR]], align 4
+// CHECK-NEXT:    [[ARRAYDECAY:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr [[ARRAYDECAY]])
+// CHECK-NEXT:    [[ARRAYDECAY1:%.*]] = getelementptr inbounds [1 x %struct.__va_list_tag], ptr [[LIST]], i64 0, i64 0
+// CHECK-NEXT:    [[GP_OFFSET_P:%.*]] = getelementptr inbounds nuw [[STRUCT___VA_LIST_TAG:%.*]], ptr [[ARRAYDECAY1]], i32 0, i32 0
+// CHECK-NEXT:    [[GP_OFFSET:%.*]] = load i32, ptr [[GP_OFFSET_P]], align 16
+// CHECK-NEXT:    [[FITS_IN_GP:%.*]] = icmp ule i32 [[GP_OFFSET]], 40
+// CHECK-NEXT:    br i1 [[FITS_IN_GP]], label [[VAARG_IN_REG:%.*]], label [[VAARG_IN_MEM:%.*]]
+// CHECK:       vaarg.in_reg:
+// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT___VA_LIST_TAG]], ptr [[ARRAYDECAY1]], i32 0, i32 3
+// CHECK-NEXT:    [[REG_SAVE_AREA:%.*]] = load ptr, ptr [[TMP0]], align 16
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[REG_SAVE_AREA]], i32 [[GP_OFFSET]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[TMP]], i32 0
+// CHECK-NEXT:    store i64 [[TMP2]], ptr [[TMP3]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[GP_OFFSET]], 8
+// CHECK-NEXT:    store i32 [[TMP4]], ptr [[GP_OFFSET_P]], align 16
+// CHECK-NEXT:    br label [[VAARG_END:%.*]]
+// CHECK:       vaarg.in_mem:
+// CHECK-NEXT:    [[OVERFLOW_ARG_AREA_P:%.*]] = getelementptr inbounds nuw [[STRUCT___VA_LIST_TAG]], ptr [[ARRAYDECAY1]], i32 0, i32 2
+// CHECK-NEXT:    [[OVERFLOW_ARG_AREA:%.*]] = load ptr, ptr [[OVERFLOW_ARG_AREA_P]], align 8
+// CHECK-NEXT:    [[OVERFLOW_ARG_AREA_NEXT:%.*]] = getelementptr i8, ptr [[OVERFLOW_ARG_AREA]], i32 16
+// CHECK-NEXT:    store ptr [[OVERFLOW_ARG_AREA_NEXT]], ptr [[OVERFLOW_ARG_AREA_P]], align 8
+// CHECK-NEXT:    br label [[VAARG_END]]
+// CHECK:       vaarg.end:
+// CHECK-NEXT:    [[VAARG_ADDR:%.*]] = phi ptr [ [[TMP]], [[VAARG_IN_REG]] ], [ [[OVERFLOW_ARG_AREA]], [[VAARG_IN_MEM]] ]
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL]], ptr align 8 [[VAARG_ADDR]], i64 16, i1 false)
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_S6]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP5:%.*]] = load i64, ptr [[COERCE_DIVE]], align 8
+// CHECK-NEXT:    ret i64 [[TMP5]]
+//
+s6 f6(int z, ...) {
+  __builtin_va_list list;
+  __builtin_va_start(list, z);
+  return __builtin_va_arg(list, s6);
+}
diff --git a/clang/test/Driver/cuda-via-liboffload.cu b/clang/test/Driver/cuda-via-liboffload.cu
new file mode 100644
index 00000000000000..68dc963e906b20
--- /dev/null
+++ b/clang/test/Driver/cuda-via-liboffload.cu
@@ -0,0 +1,23 @@
+// RUN: %clang -### -target x86_64-linux-gnu -foffload-via-llvm -ccc-print-bindings \
+// RUN:        --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \
+// RUN: | FileCheck -check-prefix BINDINGS %s
+
+//      BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[HOST_BC:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[PTX_SM_35:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_35]]"], output: "[[CUBIN_SM_35:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]", "[[HOST_BC]]"], output: "[[PTX_SM_70:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_70]]"], output: "[[CUBIN_SM_70:.+]]"
+// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Packager", inputs: ["[[CUBIN_SM_35]]", "[[CUBIN_SM_70]]"], output: "[[BINARY:.+]]"
+// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]", "[[BINARY]]"], output: "[[HOST_OBJ:.+]]"
+// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out"
+
+// RUN: %clang -### -target x86_64-linux-gnu -foffload-via-llvm -ccc-print-bindings \
+// RUN:        --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \
+// RUN: | FileCheck -check-prefix BINDINGS-DEVICE %s
+
+// BINDINGS-DEVICE: # "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[PTX:.+]]"
+// BINDINGS-DEVICE: # "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX]]"], output: "[[CUBIN:.+]]"
+
+// RUN: %clang -### -target x86_64-linux-gnu -ccc-print-bindings --offload-link -foffload-via-llvm %s 2>&1 | FileCheck -check-prefix DEVICE-LINK %s
+
+// DEVICE-LINK: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[INPUT:.+]]"], output: "a.out"
diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index e70715d2a9bd7e..068ea2d7d3c663 100644
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -21,7 +21,7 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
 // RUN:   --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=NVPTX-LINK
 
-// NVPTX-LINK: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 {{.*}}.o {{.*}}.o
+// NVPTX-LINK: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 -flto {{.*}}.o {{.*}}.o
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \
@@ -30,7 +30,7 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --device-debug -O0 \
 // RUN:   --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=NVPTX-LINK-DEBUG
 
-// NVPTX-LINK-DEBUG: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 {{.*}}.o {{.*}}.o -g 
+// NVPTX-LINK-DEBUG: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 -flto {{.*}}.o {{.*}}.o -g 
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \
@@ -39,7 +39,7 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
 // RUN:   --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=AMDGPU-LINK
 
-// AMDGPU-LINK: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o
+// AMDGPU-LINK: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.amdgpu.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030 \
@@ -48,7 +48,7 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --save-temps -O2 \
 // RUN:   --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=AMDGPU-LTO-TEMPS
 
-// AMDGPU-LTO-TEMPS: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -O2 -Wl,--no-undefined {{.*}}.o -save-temps
+// AMDGPU-LTO-TEMPS: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -O2 -flto -Wl,--no-undefined {{.*}}.o -save-temps
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu \
@@ -59,7 +59,7 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN:   --linker-path=/usr/bin/ld.lld --whole-archive %t.a --no-whole-archive \
 // RUN:   %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CPU-LINK
 
-// CPU-LINK: clang{{.*}} -o {{.*}}.img --target=x86_64-unknown-linux-gnu -march=native -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o -Wl,-Bsymbolic -shared -Wl,--whole-archive {{.*}}.a -Wl,--no-whole-archive
+// CPU-LINK: clang{{.*}} -o {{.*}}.img --target=x86_64-unknown-linux-gnu -march=native -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o -Wl,-Bsymbolic -shared -Wl,--whole-archive {{.*}}.a -Wl,--no-whole-archive
 
 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o
 // RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu -mllvm -openmp-opt-disable \
@@ -148,7 +148,7 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --clang-backend \
 // RUN:   --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CLANG-BACKEND
 
-// CLANG-BACKEND: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -Wl,--no-undefined {{.*}}.o
+// CLANG-BACKEND: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -flto -Wl,--no-undefined {{.*}}.o
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70
@@ -171,8 +171,8 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
 // RUN:   --linker-path=/usr/bin/ld %t-on.o %t-off.o %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=AMD-TARGET-ID
 
-// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+ -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o
-// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack- -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o
+// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+ -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o
+// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack- -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o
 
 // RUN: clang-offload-packager -o %t-lib.out \
 // RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=generic
@@ -187,8 +187,8 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
 // RUN:   --linker-path=/usr/bin/ld %t1.o %t2.o %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=ARCH-ALL
 
-// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o
-// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o
+// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o
+// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu \
diff --git a/clang/test/Interpreter/const.cpp b/clang/test/Interpreter/const.cpp
index 57fd880400e6a1..52be75e09ade74 100644
--- a/clang/test/Interpreter/const.cpp
+++ b/clang/test/Interpreter/const.cpp
@@ -2,6 +2,9 @@
 // see https://github.com/llvm/llvm-project/issues/68092
 // XFAIL: host={{.*}}-windows-msvc
 
+// The test is flaky with asan https://github.com/llvm/llvm-project/issues/102858.
+// UNSUPPORTED: asan
+
 // RUN: cat %s | clang-repl | FileCheck %s
 // RUN: cat %s | clang-repl -Xcc -O2 | FileCheck %s
 
diff --git a/clang/test/SemaCXX/matrix-index-operator-sign-conversion.cpp b/clang/test/SemaCXX/matrix-index-operator-sign-conversion.cpp
new file mode 100644
index 00000000000000..4254780651c5f5
--- /dev/null
+++ b/clang/test/SemaCXX/matrix-index-operator-sign-conversion.cpp
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -triple arm64-apple-macosx -std=c++11 -fenable-matrix -fsyntax-only -verify -Wsign-conversion %s
+
+template <typename T, int R, int C> using m __attribute__((__matrix_type__(R,C))) = T;
+
+// FIXME: should not warn here.
+double index1(m<double,3,1> X, int      i) { return X[i][0]; }
+// expected-warning@-1 {{implicit conversion changes signedness: 'int' to 'unsigned long'}}
+
+double index2(m<double,3,1> X, unsigned i) { return X[i][0]; }
+
+double index3(m<double,3,1> X, char     i) { return X[i][0]; }
+// expected-warning@-1 {{implicit conversion changes signedness: 'char' to 'unsigned long'}}
+
+double index4(m<double,3,1> X, int      i) { return X[0][i]; }
+// expected-warning@-1 {{implicit conversion changes signedness: 'int' to 'unsigned long'}}
+
+double index5(m<double,3,1> X, unsigned i) { return X[0][i]; }
+
+double index6(m<double,3,1> X, char     i) { return X[0][i]; }
+// expected-warning@-1 {{implicit conversion changes signedness: 'char' to 'unsigned long'}}
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 52e6809a122706..9fea1fdcd5fb46 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -527,6 +527,7 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args) {
 
   // Forward all of the `--offload-opt` and similar options to the device.
   if (linkerSupportsLTO(Args)) {
+    CmdArgs.push_back("-flto");
     for (auto &Arg : Args.filtered(OPT_offload_opt_eq_minus, OPT_mllvm))
       CmdArgs.append(
           {"-Xlinker",
diff --git a/clang/unittests/Format/FormatTestCSharp.cpp b/clang/unittests/Format/FormatTestCSharp.cpp
index 7166e4ec4de30d..3b04238b9b48b0 100644
--- a/clang/unittests/Format/FormatTestCSharp.cpp
+++ b/clang/unittests/Format/FormatTestCSharp.cpp
@@ -1149,6 +1149,17 @@ public class SaleItem
     public decimal Price { get; set; }
 })",
                MicrosoftStyle);
+
+  verifyFormat("internal class Program\n"
+               "{\n"
+               "    bool AutoAllowKnownApps\n"
+               "    {\n"
+               "        get;\n"
+               "        [Simple]\n"
+               "        set;\n"
+               "    }\n"
+               "}",
+               MicrosoftStyle);
 }
 
 TEST_F(FormatTestCSharp, DefaultLiteral) {
diff --git a/clang/unittests/Tooling/DependencyScanning/DependencyScanningFilesystemTest.cpp b/clang/unittests/Tooling/DependencyScanning/DependencyScanningFilesystemTest.cpp
index 87bb67cfd9327c..29c0b36492a90b 100644
--- a/clang/unittests/Tooling/DependencyScanning/DependencyScanningFilesystemTest.cpp
+++ b/clang/unittests/Tooling/DependencyScanning/DependencyScanningFilesystemTest.cpp
@@ -174,3 +174,35 @@ TEST(DependencyScanningFilesystem, CacheStatOnExists) {
   EXPECT_EQ(InstrumentingFS->NumStatusCalls, 2u);
   EXPECT_EQ(InstrumentingFS->NumExistsCalls, 0u);
 }
+
+TEST(DependencyScanningFilesystem, CacheStatFailures) {
+  auto InMemoryFS = llvm::makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>();
+  InMemoryFS->setCurrentWorkingDirectory("/");
+  InMemoryFS->addFile("/dir/vector", 0, llvm::MemoryBuffer::getMemBuffer(""));
+  InMemoryFS->addFile("/cache/a.pcm", 0, llvm::MemoryBuffer::getMemBuffer(""));
+
+  auto InstrumentingFS =
+      llvm::makeIntrusiveRefCnt<InstrumentingFilesystem>(InMemoryFS);
+
+  DependencyScanningFilesystemSharedCache SharedCache;
+  DependencyScanningWorkerFilesystem DepFS(SharedCache, InstrumentingFS);
+
+  DepFS.status("/dir");
+  DepFS.status("/dir");
+  EXPECT_EQ(InstrumentingFS->NumStatusCalls, 1u);
+
+  DepFS.status("/dir/vector");
+  DepFS.status("/dir/vector");
+  EXPECT_EQ(InstrumentingFS->NumStatusCalls, 2u);
+
+  DepFS.setBypassedPathPrefix("/cache");
+  DepFS.exists("/cache/a.pcm");
+  EXPECT_EQ(InstrumentingFS->NumStatusCalls, 3u);
+  DepFS.exists("/cache/a.pcm");
+  EXPECT_EQ(InstrumentingFS->NumStatusCalls, 4u);
+
+  DepFS.resetBypassedPathPrefix();
+  DepFS.exists("/cache/a.pcm");
+  DepFS.exists("/cache/a.pcm");
+  EXPECT_EQ(InstrumentingFS->NumStatusCalls, 5u);
+}
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index cd8153f60670fc..3e9bd2c23b2fc0 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -110,7 +110,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_restore)
 END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_restore)
 
 DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save)
-  .variant_pcs __arm_tpidr2_restore
+  .variant_pcs __arm_tpidr2_save
   BTI_C
   // If the current thread does not have access to TPIDR2_EL0, the subroutine
   // does nothing.
@@ -151,7 +151,7 @@ DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_tpidr2_save)
 END_COMPILERRT_OUTLINE_FUNCTION(__arm_tpidr2_save)
 
 DEFINE_COMPILERRT_OUTLINE_FUNCTION_UNMANGLED(__arm_za_disable)
-  .variant_pcs __arm_tpidr2_restore
+  .variant_pcs __arm_za_disable
   BTI_C
   // If the current thread does not have access to SME, the subroutine does
   // nothing.
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
index 8ebe37d649415f..35717c610771c1 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
@@ -788,7 +788,11 @@ void WriteOneLineToSyslog(const char *s) {
   if (GetMacosAlignedVersion() >= MacosVersion(10, 12)) {
     os_log_error(OS_LOG_DEFAULT, "%{public}s", s);
   } else {
+#pragma clang diagnostic push
+// asl_log is deprecated.
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
     asl_log(nullptr, nullptr, ASL_LEVEL_ERR, "%s", s);
+#pragma clang diagnostic pop
   }
 #endif
 }
@@ -843,6 +847,9 @@ void LogFullErrorReport(const char *buffer) {
 #if !SANITIZER_GO
   // Log with os_trace. This will make it into the crash log.
 #if SANITIZER_OS_TRACE
+#pragma clang diagnostic push
+// os_trace is deprecated.
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
   if (GetMacosAlignedVersion() >= MacosVersion(10, 10)) {
     // os_trace requires the message (format parameter) to be a string literal.
     if (internal_strncmp(SanitizerToolName, "AddressSanitizer",
@@ -860,6 +867,7 @@ void LogFullErrorReport(const char *buffer) {
     if (common_flags()->log_to_syslog)
       os_trace("Consult syslog for more information.");
   }
+#pragma clang diagnostic pop
 #endif
 
   // Log to syslog.
diff --git a/compiler-rt/lib/scudo/standalone/timing.h b/compiler-rt/lib/scudo/standalone/timing.h
index de741edbff5fff..938b2053d03806 100644
--- a/compiler-rt/lib/scudo/standalone/timing.h
+++ b/compiler-rt/lib/scudo/standalone/timing.h
@@ -14,6 +14,10 @@
 #include "string_utils.h"
 #include "thread_annotations.h"
 
+#ifndef __STDC_FORMAT_MACROS
+// Ensure PRId64 macro is available
+#define __STDC_FORMAT_MACROS 1
+#endif
 #include <inttypes.h>
 #include <string.h>
 
diff --git a/compiler-rt/lib/tsan/rtl/tsan_interceptors_mac.cpp b/compiler-rt/lib/tsan/rtl/tsan_interceptors_mac.cpp
index e4f9e2915ced2e..9db0eebd923696 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_interceptors_mac.cpp
+++ b/compiler-rt/lib/tsan/rtl/tsan_interceptors_mac.cpp
@@ -94,6 +94,10 @@ static constexpr morder kMacFailureOrder = mo_relaxed;
   m_orig(int32_t, uint32_t, a32, f##32##OrigBarrier,                           \
     __tsan_atomic32_##tsan_atomic_f, kMacOrderBarrier)
 
+
+#pragma clang diagnostic push
+// OSAtomic* functions are deprecated.
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
 OSATOMIC_INTERCEPTORS_ARITHMETIC(OSAtomicAdd, fetch_add,
                                  OSATOMIC_INTERCEPTOR_PLUS_X)
 OSATOMIC_INTERCEPTORS_ARITHMETIC(OSAtomicIncrement, fetch_add,
@@ -123,6 +127,9 @@ OSATOMIC_INTERCEPTORS_BITWISE(OSAtomicXor, fetch_xor,
         kMacOrderBarrier, kMacFailureOrder);                                \
   }
 
+#pragma clang diagnostic push
+// OSAtomicCompareAndSwap* functions are deprecated.
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
 OSATOMIC_INTERCEPTORS_CAS(OSAtomicCompareAndSwapInt, __tsan_atomic32, a32, int)
 OSATOMIC_INTERCEPTORS_CAS(OSAtomicCompareAndSwapLong, __tsan_atomic64, a64,
                           long_t)
@@ -132,6 +139,7 @@ OSATOMIC_INTERCEPTORS_CAS(OSAtomicCompareAndSwap32, __tsan_atomic32, a32,
                           int32_t)
 OSATOMIC_INTERCEPTORS_CAS(OSAtomicCompareAndSwap64, __tsan_atomic64, a64,
                           int64_t)
+#pragma clang diagnostic pop
 
 #define OSATOMIC_INTERCEPTOR_BITOP(f, op, clear, mo)             \
   TSAN_INTERCEPTOR(bool, f, uint32_t n, volatile void *ptr) {    \
diff --git a/compiler-rt/test/fuzzer/fork-sigusr.test b/compiler-rt/test/fuzzer/fork-sigusr.test
index 4f796171fbd116..088e63cae43118 100644
--- a/compiler-rt/test/fuzzer/fork-sigusr.test
+++ b/compiler-rt/test/fuzzer/fork-sigusr.test
@@ -1,5 +1,6 @@
 # Check that libFuzzer honors SIGUSR1/SIGUSR2
 # Disabled on Windows which does not have SIGUSR1/SIGUSR2.
+REQUIRES: shell
 UNSUPPORTED: darwin, target={{.*windows.*}}, target=aarch64{{.*}}
 RUN: rm -rf %t
 RUN: mkdir -p %t
diff --git a/compiler-rt/test/fuzzer/merge-sigusr.test b/compiler-rt/test/fuzzer/merge-sigusr.test
index 762ae0d106d289..4e492775400b98 100644
--- a/compiler-rt/test/fuzzer/merge-sigusr.test
+++ b/compiler-rt/test/fuzzer/merge-sigusr.test
@@ -1,6 +1,7 @@
 # Check that libFuzzer honors SIGUSR1/SIGUSR2
 # FIXME: Disabled on Windows for now because of reliance on posix only features
 # (eg: export, "&", pkill).
+REQUIRES: shell
 UNSUPPORTED: darwin, target={{.*windows.*}}
 RUN: rm -rf %t
 RUN: mkdir -p %t
diff --git a/compiler-rt/test/fuzzer/sigint.test b/compiler-rt/test/fuzzer/sigint.test
index 0e239c3ce53859..ac482d79b8e282 100644
--- a/compiler-rt/test/fuzzer/sigint.test
+++ b/compiler-rt/test/fuzzer/sigint.test
@@ -1,4 +1,4 @@
-REQUIRES: msan
+REQUIRES: shell, msan
 UNSUPPORTED: target=arm{{.*}}
 
 # Check that libFuzzer exits gracefully under SIGINT with MSan.
diff --git a/compiler-rt/test/fuzzer/sigusr.test b/compiler-rt/test/fuzzer/sigusr.test
index c3d7adf8ea99b3..c8a77ac63a6d7c 100644
--- a/compiler-rt/test/fuzzer/sigusr.test
+++ b/compiler-rt/test/fuzzer/sigusr.test
@@ -1,5 +1,6 @@
 # FIXME: Disabled on Windows for now because of reliance on posix only features
 # (eg: export, "&", pkill).
+REQUIRES: shell
 UNSUPPORTED: darwin, target={{.*windows.*}}
 # Check that libFuzzer honors SIGUSR1/SIGUSR2
 RUN: rm -rf %t
diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index de4d415eda6fd6..3675d9f924876a 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -1247,7 +1247,6 @@ inline bool CanCUDASymbolHasSave(const Symbol &sym) {
   if (const auto *details =
           sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()) {
     if (details->cudaDataAttr() &&
-        *details->cudaDataAttr() != common::CUDADataAttr::Pinned &&
         *details->cudaDataAttr() != common::CUDADataAttr::Unified) {
       return false;
     }
diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp
index 039dbcb82f7452..fcedf5ec3ddf83 100644
--- a/flang/lib/Evaluate/intrinsics.cpp
+++ b/flang/lib/Evaluate/intrinsics.cpp
@@ -2628,10 +2628,7 @@ bool IntrinsicProcTable::Implementation::IsDualIntrinsic(
   static const std::string dualIntrinsic[]{
       {"etime"s}, {"getcwd"s}, {"rename"s}, {"second"s}};
 
-  return std::find_if(std::begin(dualIntrinsic), std::end(dualIntrinsic),
-             [&name](const std::string &dualName) {
-               return dualName == name;
-             }) != std::end(dualIntrinsic);
+  return llvm::is_contained(dualIntrinsic, name);
 }
 
 IntrinsicClass IntrinsicProcTable::Implementation::GetIntrinsicClass(
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 1c0e541e4a36a7..26825468df9b1d 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1700,10 +1700,10 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
     // map for it.
     if (const Fortran::semantics::Symbol *common =
             Fortran::semantics::FindCommonBlockContaining(sym.GetUltimate()))
-      if (llvm::find(mapSyms, common) != mapSyms.end())
+      if (llvm::is_contained(mapSyms, common))
         return;
 
-    if (llvm::find(mapSyms, &sym) == mapSyms.end()) {
+    if (!llvm::is_contained(mapSyms, &sym)) {
       mlir::Value baseOp = converter.getSymbolAddress(sym);
       if (!baseOp)
         if (const auto *details =
diff --git a/flang/test/Evaluate/fold-out_of_range.f90 b/flang/test/Evaluate/fold-out_of_range.f90
index 81551255135d2f..5a9f900beb2d59 100644
--- a/flang/test/Evaluate/fold-out_of_range.f90
+++ b/flang/test/Evaluate/fold-out_of_range.f90
@@ -1,5 +1,6 @@
-! RUN: %python %S/test_folding.py %s %flang_fc1 -pedantic
-! UNSUPPORTED: target=powerpc{{.*}}, target=aarch{{.*}}, target=arm{{.*}}, system-windows, system-solaris
+! RUN: %python %S/test_folding.py %s %flang_fc1 -pedantic -triple x86_64-unknown-linux-gnu
+! UNSUPPORTED: system-windows
+! REQUIRES: target=x86_64{{.*}}
 ! Tests folding of OUT_OF_RANGE().
 module m
   integer(1),  parameter :: i1v(*)  = [ -huge(1_1)  - 1_1,  huge(1_1) ]
diff --git a/flang/test/Lower/CUDA/cuda-program-global.cuf b/flang/test/Lower/CUDA/cuda-program-global.cuf
index a3c9e1ba8d253c..90b401c9ba6a5c 100644
--- a/flang/test/Lower/CUDA/cuda-program-global.cuf
+++ b/flang/test/Lower/CUDA/cuda-program-global.cuf
@@ -6,6 +6,7 @@
 program test
   integer, device :: a(10)
   integer, unified :: u(10)
+  integer, allocatable, pinned :: p(:)
   integer :: b(10)
   integer :: i
   print*,i
@@ -16,6 +17,7 @@ end
 ! CHECK: fir.address_of(@_QFEb) : !fir.ref<!fir.array<10xi32>>
 ! CHECK: %[[ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
 ! CHECK: hlfir.declare %[[ALLOCA]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: cuf.alloc !fir.box<!fir.heap<!fir.array<?xi32>>> {bindc_name = "p", data_attr = #cuf.cuda<pinned>, uniq_name = "_QFEp"} -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
 
 ! CHECK-NOT: fir.global internal @_QFEa {data_attr = #cuf.cuda<device>} : !fir.array<10xi32> {{{$}}
 ! CHECK: fir.global internal @_QFEb : !fir.array<10xi32> {{{$}}
diff --git a/libc/config/linux/aarch64/headers.txt b/libc/config/linux/aarch64/headers.txt
index 8f898f0150905a..ebe053af99d803 100644
--- a/libc/config/linux/aarch64/headers.txt
+++ b/libc/config/linux/aarch64/headers.txt
@@ -2,6 +2,7 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.assert
     libc.include.ctype
     libc.include.dlfcn
+    libc.include.elf
     libc.include.errno
     libc.include.features
     libc.include.fenv
@@ -9,6 +10,7 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.stdint
     libc.include.inttypes
     libc.include.limits
+    libc.include.link
     libc.include.math
     libc.include.pthread
     libc.include.signal
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 748401e4cf8ee8..65c5757efe6274 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -597,6 +597,7 @@ if(LIBC_TYPES_HAS_FLOAT16)
     libc.src.math.exp10f16
     libc.src.math.exp2f16
     libc.src.math.expf16
+    libc.src.math.expm1f16
     libc.src.math.f16add
     libc.src.math.f16addf
     libc.src.math.f16addl
diff --git a/libc/config/linux/x86_64/headers.txt b/libc/config/linux/x86_64/headers.txt
index 0294f62bc2f7a0..77e454e64395df 100644
--- a/libc/config/linux/x86_64/headers.txt
+++ b/libc/config/linux/x86_64/headers.txt
@@ -3,6 +3,7 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.ctype
     libc.include.dirent
     libc.include.dlfcn
+    libc.include.elf
     libc.include.errno
     libc.include.fcntl
     libc.include.features
@@ -11,6 +12,7 @@ set(TARGET_PUBLIC_HEADERS
     libc.include.stdint
     libc.include.inttypes
     libc.include.limits
+    libc.include.link
     libc.include.math
     libc.include.pthread
     libc.include.sched
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index b617289d1364fd..185d2d440849a0 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -294,7 +294,7 @@ Higher Math Functions
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | exp2m1    | |check|          |                 |                        |                      |                        | 7.12.6.5               | F.10.3.5                   |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| expm1     | |check|          | |check|         |                        |                      |                        | 7.12.6.6               | F.10.3.6                   |
+| expm1     | |check|          | |check|         |                        | |check|              |                        | 7.12.6.6               | F.10.3.6                   |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | fma       | |check|          | |check|         |                        |                      |                        | 7.12.13.1              | F.10.10.1                  |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
index cbde24e17619f6..2b6eb61782a632 100644
--- a/libc/include/CMakeLists.txt
+++ b/libc/include/CMakeLists.txt
@@ -420,6 +420,23 @@ add_header_macro(
     .llvm-libc-types.posix_spawn_file_actions_t
 )
 
+add_gen_header(
+  link
+  DEF_FILE link.h.def
+  GEN_HDR link.h
+  DEPENDS
+    .llvm_libc_common_h
+    .llvm-libc-macros.link_macros
+)
+
+add_gen_header(
+  elf
+  DEF_FILE elf.h.def
+  GEN_HDR elf.h
+  DEPENDS
+    .llvm-libc-macros.elf_macros
+)
+
 # TODO: Not all platforms will have a include/sys directory. Add the sys
 # directory and the targets for sys/*.h files conditional to the OS requiring
 # them.
diff --git a/libc/include/elf.h.def b/libc/include/elf.h.def
new file mode 100644
index 00000000000000..b9b2604fc7167b
--- /dev/null
+++ b/libc/include/elf.h.def
@@ -0,0 +1,17 @@
+//===-- System V header elf.h ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_ELF_H
+#define LLVM_LIBC_ELF_H
+
+#include "__llvm-libc-common.h"
+#include "llvm-libc-macros/elf-macros.h"
+
+%%public_api()
+
+#endif // LLVM_LIBC_ELF_H
diff --git a/libc/include/link.h.def b/libc/include/link.h.def
new file mode 100644
index 00000000000000..ebab81c841b8dd
--- /dev/null
+++ b/libc/include/link.h.def
@@ -0,0 +1,17 @@
+//===-- GNU header link.h -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_LINK_H
+#define LLVM_LIBC_LINK_H
+
+#include "__llvm-libc-common.h"
+#include "llvm-libc-macros/link-macros.h"
+
+%%public_api()
+
+#endif // LLVM_LIBC_LINK_H
diff --git a/libc/include/llvm-libc-macros/CMakeLists.txt b/libc/include/llvm-libc-macros/CMakeLists.txt
index 3c10abef8768c0..60a8725f9ef63f 100644
--- a/libc/include/llvm-libc-macros/CMakeLists.txt
+++ b/libc/include/llvm-libc-macros/CMakeLists.txt
@@ -289,3 +289,9 @@ add_macro_header(
   HDR
     dlfcn-macros.h
 )
+
+add_macro_header(
+  elf_macros
+  HDR
+    elf-macros.h
+)
diff --git a/libc/include/llvm-libc-macros/elf-macros.h b/libc/include/llvm-libc-macros/elf-macros.h
new file mode 100644
index 00000000000000..fa4442abf0f5ce
--- /dev/null
+++ b/libc/include/llvm-libc-macros/elf-macros.h
@@ -0,0 +1,18 @@
+//===-- Definition of macros from elf.h -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_MACROS_ELF_MACROS_H
+#define LLVM_LIBC_MACROS_ELF_MACROS_H
+
+#if __has_include(<linux/elf.h>)
+#include <linux/elf.h>
+#else
+#error "cannot use <elf.h> without proper system headers."
+#endif
+
+#endif // LLVM_LIBC_MACROS_ELF_MACROS_H
diff --git a/libc/include/llvm-libc-macros/link-macros.h b/libc/include/llvm-libc-macros/link-macros.h
index 5c8cadab8e71cb..89e7bb50aa5566 100644
--- a/libc/include/llvm-libc-macros/link-macros.h
+++ b/libc/include/llvm-libc-macros/link-macros.h
@@ -6,8 +6,15 @@
 //
 //===----------------------------------------------------------------------===//
 
+#ifndef LLVM_LIBC_MACROS_LINK_MACROS_H
+#define LLVM_LIBC_MACROS_LINK_MACROS_H
+
+#include "elf-macros.h"
+
 #ifdef __LP64__
-#define ElfW(type) Elf64_ ## type
+#define ElfW(type) Elf64_##type
 #else
-#define ElfW(type) Elf32_ ## type
+#define ElfW(type) Elf32_##type
+#endif
+
 #endif
diff --git a/libc/newhdrgen/yaml/elf.yaml b/libc/newhdrgen/yaml/elf.yaml
new file mode 100644
index 00000000000000..2e9db329e22979
--- /dev/null
+++ b/libc/newhdrgen/yaml/elf.yaml
@@ -0,0 +1,8 @@
+header: elf.h
+standards:
+  - Linux
+macros: []
+types: []
+enums: []
+objects: []
+functions: []
diff --git a/libc/newhdrgen/yaml/link.yaml b/libc/newhdrgen/yaml/link.yaml
new file mode 100644
index 00000000000000..d1963a86813af3
--- /dev/null
+++ b/libc/newhdrgen/yaml/link.yaml
@@ -0,0 +1,8 @@
+header: link.h
+standards:
+  - Linux
+macros: []
+types: []
+enums: []
+objects: []
+functions: []
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 077f66e78c1167..e06a4f9b268e66 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -599,6 +599,7 @@ def StdC : StandardSpec<"stdc"> {
 
           FunctionSpec<"expm1", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
           FunctionSpec<"expm1f", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
+          GuardedFunctionSpec<"expm1f16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
 
           FunctionSpec<"exp10", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>,
           FunctionSpec<"exp10f", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 9f13205a60702e..132511a536366c 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -121,6 +121,7 @@ add_math_entrypoint_object(exp10f16)
 
 add_math_entrypoint_object(expm1)
 add_math_entrypoint_object(expm1f)
+add_math_entrypoint_object(expm1f16)
 
 add_math_entrypoint_object(f16add)
 add_math_entrypoint_object(f16addf)
diff --git a/libc/src/math/expm1f16.h b/libc/src/math/expm1f16.h
new file mode 100644
index 00000000000000..644e6cddd7666a
--- /dev/null
+++ b/libc/src/math/expm1f16.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for expm1f16 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_EXPM1F16_H
+#define LLVM_LIBC_SRC_MATH_EXPM1F16_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float16 expm1f16(float16 x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_EXPM1F16_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 79ba07616e507f..745bd65e1d75b5 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1359,16 +1359,15 @@ add_entrypoint_object(
   HDRS
     ../expf16.h
   DEPENDS
+    .expxf16
     libc.hdr.errno_macros
     libc.hdr.fenv_macros
-    libc.src.__support.CPP.array
     libc.src.__support.FPUtil.except_value_utils
     libc.src.__support.FPUtil.fenv_impl
     libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.multiply_add
-    libc.src.__support.FPUtil.nearest_integer
     libc.src.__support.FPUtil.polyeval
     libc.src.__support.FPUtil.rounding_mode
+    libc.src.__support.macros.attributes
     libc.src.__support.macros.optimization
   COMPILE_OPTIONS
     -O3
@@ -1608,6 +1607,27 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  expm1f16
+  SRCS
+    expm1f16.cpp
+  HDRS
+    ../expm1f16.h
+  DEPENDS
+    .expxf16
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.__support.FPUtil.except_value_utils
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.polyeval
+    libc.src.__support.FPUtil.rounding_mode
+    libc.src.__support.macros.optimization
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   powf
   SRCS
@@ -5092,4 +5112,7 @@ add_header_library(
     expxf16.h
   DEPENDS
     libc.src.__support.CPP.array
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.nearest_integer
+    libc.src.__support.FPUtil.polyeval
 )
diff --git a/libc/src/math/generic/expf16.cpp b/libc/src/math/generic/expf16.cpp
index b198c559dfedb9..7ffdbd5191008a 100644
--- a/libc/src/math/generic/expf16.cpp
+++ b/libc/src/math/generic/expf16.cpp
@@ -7,15 +7,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/expf16.h"
+#include "expxf16.h"
 #include "hdr/errno_macros.h"
 #include "hdr/fenv_macros.h"
-#include "src/__support/CPP/array.h"
 #include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/PolyEval.h"
 #include "src/__support/FPUtil/except_value_utils.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/FPUtil/nearest_integer.h"
 #include "src/__support/FPUtil/rounding_mode.h"
 #include "src/__support/common.h"
 #include "src/__support/macros/config.h"
@@ -41,28 +39,6 @@ static constexpr fputil::ExceptValues<float16, 3> EXPF16_EXCEPTS_HI = {{
     {0xa954U, 0x3bacU, 1U, 0U, 0U},
 }};
 
-// Generated by Sollya with the following commands:
-//   > display = hexadecimal;
-//   > for i from -18 to 12 do print(round(exp(i), SG, RN));
-static constexpr cpp::array<float, 31> EXP_HI = {
-    0x1.05a628p-26f, 0x1.639e32p-25f, 0x1.e355bcp-24f, 0x1.4875cap-22f,
-    0x1.be6c7p-21f,  0x1.2f6054p-19f, 0x1.9c54c4p-18f, 0x1.183542p-16f,
-    0x1.7cd79cp-15f, 0x1.02cf22p-13f, 0x1.5fc21p-12f,  0x1.de16bap-11f,
-    0x1.44e52p-9f,   0x1.b993fep-8f,  0x1.2c155cp-6f,  0x1.97db0cp-5f,
-    0x1.152aaap-3f,  0x1.78b564p-2f,  0x1p+0f,         0x1.5bf0a8p+1f,
-    0x1.d8e64cp+2f,  0x1.415e5cp+4f,  0x1.b4c902p+5f,  0x1.28d38ap+7f,
-    0x1.936dc6p+8f,  0x1.122886p+10f, 0x1.749ea8p+11f, 0x1.fa7158p+12f,
-    0x1.5829dcp+14f, 0x1.d3c448p+15f, 0x1.3de166p+17f,
-};
-
-// Generated by Sollya with the following commands:
-//   > display = hexadecimal;
-//   > for i from 0 to 7 do print(round(exp(i * 2^-3), SG, RN));
-static constexpr cpp::array<float, 8> EXP_MID = {
-    0x1p+0f,        0x1.221604p+0f, 0x1.48b5e4p+0f, 0x1.747a52p+0f,
-    0x1.a61298p+0f, 0x1.de455ep+0f, 0x1.0ef9dcp+1f, 0x1.330e58p+1f,
-};
-
 LLVM_LIBC_FUNCTION(float16, expf16, (float16 x)) {
   using FPBits = fputil::FPBits<float16>;
   FPBits x_bits(x);
@@ -135,38 +111,9 @@ LLVM_LIBC_FUNCTION(float16, expf16, (float16 x)) {
   if (auto r = EXPF16_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
     return r.value();
 
-  // For -18 < x < 12, to compute exp(x), we perform the following range
-  // reduction: find hi, mid, lo, such that:
-  //   x = hi + mid + lo, in which
-  //     hi is an integer,
-  //     mid * 2^3 is an integer,
-  //     -2^(-4) <= lo < 2^(-4).
-  // In particular,
-  //   hi + mid = round(x * 2^3) * 2^(-3).
-  // Then,
-  //   exp(x) = exp(hi + mid + lo) = exp(hi) * exp(mid) * exp(lo).
-  // We store exp(hi) and exp(mid) in the lookup tables EXP_HI and EXP_MID
-  // respectively.  exp(lo) is computed using a degree-3 minimax polynomial
-  // generated by Sollya.
-
-  float xf = x;
-  float kf = fputil::nearest_integer(xf * 0x1.0p+3f);
-  int x_hi_mid = static_cast<int>(kf);
-  int x_hi = x_hi_mid >> 3;
-  int x_mid = x_hi_mid & 0x7;
-  // lo = x - (hi + mid) = round(x * 2^3) * (-2^(-3)) + x
-  float lo = fputil::multiply_add(kf, -0x1.0p-3f, xf);
-
-  float exp_hi = EXP_HI[x_hi + 18];
-  float exp_mid = EXP_MID[x_mid];
-  // Degree-3 minimax polynomial generated by Sollya with the following
-  // commands:
-  //   > display = hexadecimal;
-  //   > P = fpminimax(expm1(x)/x, 2, [|SG...|], [-2^-4, 2^-4]);
-  //   > 1 + x * P;
-  float exp_lo =
-      fputil::polyeval(lo, 0x1p+0f, 0x1p+0f, 0x1.001p-1f, 0x1.555ddep-3f);
-  return static_cast<float16>(exp_hi * exp_mid * exp_lo);
+  // exp(x) = exp(hi + mid) * exp(lo)
+  auto [exp_hi_mid, exp_lo] = exp_range_reduction(x);
+  return static_cast<float16>(exp_hi_mid * exp_lo);
 }
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/expm1f16.cpp b/libc/src/math/generic/expm1f16.cpp
new file mode 100644
index 00000000000000..0facdc510e4287
--- /dev/null
+++ b/libc/src/math/generic/expm1f16.cpp
@@ -0,0 +1,132 @@
+//===-- Half-precision e^x - 1 function -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/expm1f16.h"
+#include "expxf16.h"
+#include "hdr/errno_macros.h"
+#include "hdr/fenv_macros.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+static constexpr fputil::ExceptValues<float16, 1> EXPM1F16_EXCEPTS_LO = {{
+    // (input, RZ output, RU offset, RD offset, RN offset)
+    // x = 0x1.564p-5, expm1f16(x) = 0x1.5d4p-5 (RZ)
+    {0x2959U, 0x2975U, 1U, 0U, 1U},
+}};
+
+#ifdef LIBC_TARGET_CPU_HAS_FMA
+static constexpr size_t N_EXPM1F16_EXCEPTS_HI = 2;
+#else
+static constexpr size_t N_EXPM1F16_EXCEPTS_HI = 3;
+#endif
+
+static constexpr fputil::ExceptValues<float16, N_EXPM1F16_EXCEPTS_HI>
+    EXPM1F16_EXCEPTS_HI = {{
+        // (input, RZ output, RU offset, RD offset, RN offset)
+        // x = 0x1.c34p+0, expm1f16(x) = 0x1.34cp+2 (RZ)
+        {0x3f0dU, 0x44d3U, 1U, 0U, 1U},
+        // x = -0x1.e28p-3, expm1f16(x) = -0x1.adcp-3 (RZ)
+        {0xb38aU, 0xb2b7U, 0U, 1U, 1U},
+#ifndef LIBC_TARGET_CPU_HAS_FMA
+        // x = 0x1.a08p-3, expm1f16(x) = 0x1.cdcp-3 (RZ)
+        {0x3282U, 0x3337U, 1U, 0U, 0U},
+#endif
+    }};
+
+LLVM_LIBC_FUNCTION(float16, expm1f16, (float16 x)) {
+  using FPBits = fputil::FPBits<float16>;
+  FPBits x_bits(x);
+
+  uint16_t x_u = x_bits.uintval();
+  uint16_t x_abs = x_u & 0x7fffU;
+
+  // When |x| <= 2^(-3), or |x| >= 11 * log(2), or x is NaN.
+  if (LIBC_UNLIKELY(x_abs <= 0x3000U || x_abs >= 0x47a0U)) {
+    // expm1(NaN) = NaN
+    if (x_bits.is_nan()) {
+      if (x_bits.is_signaling_nan()) {
+        fputil::raise_except_if_required(FE_INVALID);
+        return FPBits::quiet_nan().get_val();
+      }
+
+      return x;
+    }
+
+    // expm1(+/-0) = +/-0
+    if (x_abs == 0)
+      return x;
+
+    // When x >= 16 * log(2).
+    if (x_bits.is_pos() && x_abs >= 0x498cU) {
+      // expm1(+inf) = +inf
+      if (x_bits.is_inf())
+        return FPBits::inf().get_val();
+
+      switch (fputil::quick_get_round()) {
+      case FE_TONEAREST:
+      case FE_UPWARD:
+        fputil::set_errno_if_required(ERANGE);
+        fputil::raise_except_if_required(FE_OVERFLOW | FE_INEXACT);
+        return FPBits::inf().get_val();
+      default:
+        return FPBits::max_normal().get_val();
+      }
+    }
+
+    // When x <= -11 * log(2).
+    if (x_u >= 0xc7a0U) {
+      // expm1(-inf) = -1
+      if (x_bits.is_inf())
+        return FPBits::one(Sign::NEG).get_val();
+
+      // When x < -0x1.0ap+3, round(expm1(x), HP, RN) = -1.
+      if (x_u > 0xc828U)
+        return fputil::round_result_slightly_up(
+            FPBits::one(Sign::NEG).get_val());
+      // When x >= -0x1.0ap+3, round(expm1(x), HP, RN) = -0x1.ffcp-1.
+      return fputil::round_result_slightly_down(
+          static_cast<float16>(-0x1.ffcp-1));
+    }
+
+    // When 0 < |x| <= 2^(-3).
+    if (x_abs <= 0x3000U && !x_bits.is_zero()) {
+      if (auto r = EXPM1F16_EXCEPTS_LO.lookup(x_u);
+          LIBC_UNLIKELY(r.has_value()))
+        return r.value();
+
+      float xf = x;
+      // Degree-5 minimax polynomial generated by Sollya with the following
+      // commands:
+      //   > display = hexadecimal;
+      //   > P = fpminimax(expm1(x)/x, 4, [|SG...|], [-2^-3, 2^-3]);
+      //   > x * P;
+      return static_cast<float16>(
+          xf * fputil::polyeval(xf, 0x1p+0f, 0x1.fffff8p-2f, 0x1.555556p-3f,
+                                0x1.55905ep-5f, 0x1.1124c2p-7f));
+    }
+  }
+
+  if (auto r = EXPM1F16_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
+    return r.value();
+
+  // exp(x) = exp(hi + mid) * exp(lo)
+  auto [exp_hi_mid, exp_lo] = exp_range_reduction(x);
+  // expm1(x) = exp(hi + mid) * exp(lo) - 1
+  return static_cast<float16>(fputil::multiply_add(exp_hi_mid, exp_lo, -1.0f));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/expxf16.h b/libc/src/math/generic/expxf16.h
index c33aca337b98dc..a0db6cee438e92 100644
--- a/libc/src/math/generic/expxf16.h
+++ b/libc/src/math/generic/expxf16.h
@@ -10,11 +10,77 @@
 #define LLVM_LIBC_SRC_MATH_GENERIC_EXPXF16_H
 
 #include "src/__support/CPP/array.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/macros/attributes.h"
 #include "src/__support/macros/config.h"
 #include <stdint.h>
 
 namespace LIBC_NAMESPACE_DECL {
 
+// Generated by Sollya with the following commands:
+//   > display = hexadecimal;
+//   > for i from -18 to 12 do print(round(exp(i), SG, RN));
+static constexpr cpp::array<float, 31> EXP_HI = {
+    0x1.05a628p-26f, 0x1.639e32p-25f, 0x1.e355bcp-24f, 0x1.4875cap-22f,
+    0x1.be6c7p-21f,  0x1.2f6054p-19f, 0x1.9c54c4p-18f, 0x1.183542p-16f,
+    0x1.7cd79cp-15f, 0x1.02cf22p-13f, 0x1.5fc21p-12f,  0x1.de16bap-11f,
+    0x1.44e52p-9f,   0x1.b993fep-8f,  0x1.2c155cp-6f,  0x1.97db0cp-5f,
+    0x1.152aaap-3f,  0x1.78b564p-2f,  0x1p+0f,         0x1.5bf0a8p+1f,
+    0x1.d8e64cp+2f,  0x1.415e5cp+4f,  0x1.b4c902p+5f,  0x1.28d38ap+7f,
+    0x1.936dc6p+8f,  0x1.122886p+10f, 0x1.749ea8p+11f, 0x1.fa7158p+12f,
+    0x1.5829dcp+14f, 0x1.d3c448p+15f, 0x1.3de166p+17f,
+};
+
+// Generated by Sollya with the following commands:
+//   > display = hexadecimal;
+//   > for i from 0 to 7 do print(round(exp(i * 2^-3), SG, RN));
+static constexpr cpp::array<float, 8> EXP_MID = {
+    0x1p+0f,        0x1.221604p+0f, 0x1.48b5e4p+0f, 0x1.747a52p+0f,
+    0x1.a61298p+0f, 0x1.de455ep+0f, 0x1.0ef9dcp+1f, 0x1.330e58p+1f,
+};
+
+struct ExpRangeReduction {
+  float exp_hi_mid;
+  float exp_lo;
+};
+
+LIBC_INLINE ExpRangeReduction exp_range_reduction(float16 x) {
+  // For -18 < x < 12, to compute exp(x), we perform the following range
+  // reduction: find hi, mid, lo, such that:
+  //   x = hi + mid + lo, in which
+  //     hi is an integer,
+  //     mid * 2^3 is an integer,
+  //     -2^(-4) <= lo < 2^(-4).
+  // In particular,
+  //   hi + mid = round(x * 2^3) * 2^(-3).
+  // Then,
+  //   exp(x) = exp(hi + mid + lo) = exp(hi) * exp(mid) * exp(lo).
+  // We store exp(hi) and exp(mid) in the lookup tables EXP_HI and EXP_MID
+  // respectively.  exp(lo) is computed using a degree-3 minimax polynomial
+  // generated by Sollya.
+
+  float xf = x;
+  float kf = fputil::nearest_integer(xf * 0x1.0p+3f);
+  int x_hi_mid = static_cast<int>(kf);
+  int x_hi = x_hi_mid >> 3;
+  int x_mid = x_hi_mid & 0x7;
+  // lo = x - (hi + mid) = round(x * 2^3) * (-2^(-3)) + x
+  float lo = fputil::multiply_add(kf, -0x1.0p-3f, xf);
+
+  float exp_hi = EXP_HI[x_hi + 18];
+  float exp_mid = EXP_MID[x_mid];
+  // Degree-3 minimax polynomial generated by Sollya with the following
+  // commands:
+  //   > display = hexadecimal;
+  //   > P = fpminimax(expm1(x)/x, 2, [|SG...|], [-2^-4, 2^-4]);
+  //   > 1 + x * P;
+  float exp_lo =
+      fputil::polyeval(lo, 0x1p+0f, 0x1p+0f, 0x1.001p-1f, 0x1.555ddep-3f);
+  return {exp_hi * exp_mid, exp_lo};
+}
+
 // Generated by Sollya with the following commands:
 //   > display = hexadecimal;
 //   > for i from 0 to 7 do printsingle(round(2^(i * 2^-3), SG, RN));
diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h
index 2749908ef18495..43752a4942ad56 100644
--- a/libc/test/UnitTest/FPMatcher.h
+++ b/libc/test/UnitTest/FPMatcher.h
@@ -234,4 +234,36 @@ template <typename T> struct FPTest : public Test {
 #define EXPECT_FP_EQ_ROUNDING_TOWARD_ZERO(expected, actual)                    \
   EXPECT_FP_EQ_ROUNDING_MODE((expected), (actual), RoundingMode::TowardZero)
 
+#define EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_MODE(                             \
+    expected, actual, expected_except, rounding_mode)                          \
+  do {                                                                         \
+    using namespace LIBC_NAMESPACE::fputil::testing;                           \
+    ForceRoundingMode __r((rounding_mode));                                    \
+    if (__r.success) {                                                         \
+      LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT);                     \
+      EXPECT_FP_EQ((expected), (actual));                                      \
+      EXPECT_FP_EXCEPTION(expected_except);                                    \
+    }                                                                          \
+  } while (0)
+
+#define EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(expected, actual,         \
+                                                     expected_except)          \
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_MODE(                                   \
+      (expected), (actual), (expected_except), RoundingMode::Nearest)
+
+#define EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(expected, actual,          \
+                                                    expected_except)           \
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_MODE(                                   \
+      (expected), (actual), (expected_except), RoundingMode::Upward)
+
+#define EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD(expected, actual,        \
+                                                      expected_except)         \
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_MODE(                                   \
+      (expected), (actual), (expected_except), RoundingMode::Downward)
+
+#define EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO(expected, actual,     \
+                                                         expected_except)      \
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_MODE(                                   \
+      (expected), (actual), (expected_except), RoundingMode::TowardZero)
+
 #endif // LLVM_LIBC_TEST_UNITTEST_FPMATCHER_H
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index a8da72fb01be61..0c4118c3694548 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -1676,6 +1676,19 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  expm1_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    expm1_test.cpp
+  DEPENDS
+    libc.src.errno.errno
+    libc.src.math.expm1
+    libc.src.__support.FPUtil.fp_bits
+)
+
 add_fp_unittest(
   expm1f_test
   NEED_MPFR
@@ -1690,16 +1703,14 @@ add_fp_unittest(
 )
 
 add_fp_unittest(
- expm1_test
- NEED_MPFR
- SUITE
-   libc-math-unittests
- SRCS
-   expm1_test.cpp
- DEPENDS
-   libc.src.errno.errno
-   libc.src.math.expm1
-   libc.src.__support.FPUtil.fp_bits
+  expm1f16_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    expm1f16_test.cpp
+  DEPENDS
+    libc.src.math.expm1f16
 )
 
 add_fp_unittest(
diff --git a/libc/test/src/math/expm1f16_test.cpp b/libc/test/src/math/expm1f16_test.cpp
new file mode 100644
index 00000000000000..a6a6fcf73d383f
--- /dev/null
+++ b/libc/test/src/math/expm1f16_test.cpp
@@ -0,0 +1,40 @@
+//===-- Exhaustive test for expm1f16 --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/expm1f16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+using LlvmLibcExpm1f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+// Range: [0, Inf];
+static constexpr uint16_t POS_START = 0x0000U;
+static constexpr uint16_t POS_STOP = 0x7c00U;
+
+// Range: [-Inf, 0];
+static constexpr uint16_t NEG_START = 0x8000U;
+static constexpr uint16_t NEG_STOP = 0xfc00U;
+
+TEST_F(LlvmLibcExpm1f16Test, PositiveRange) {
+  for (uint16_t v = POS_START; v <= POS_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x,
+                                   LIBC_NAMESPACE::expm1f16(x), 0.5);
+  }
+}
+
+TEST_F(LlvmLibcExpm1f16Test, NegativeRange) {
+  for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x,
+                                   LIBC_NAMESPACE::expm1f16(x), 0.5);
+  }
+}
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 460797b74a13e2..7271e933b9311d 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -3324,6 +3324,18 @@ add_fp_unittest(
     libc.src.math.fma
 )
 
+add_fp_unittest(
+  expm1_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    expm1_test.cpp
+  DEPENDS
+    libc.src.errno.errno
+    libc.src.math.expm1
+    libc.src.__support.FPUtil.fp_bits
+)
+
 add_fp_unittest(
   expm1f_test
   SUITE
@@ -3337,15 +3349,16 @@ add_fp_unittest(
 )
 
 add_fp_unittest(
-  expm1_test
+  expm1f16_test
   SUITE
     libc-math-smoke-tests
   SRCS
-    expm1_test.cpp
+    expm1f16_test.cpp
   DEPENDS
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
     libc.src.errno.errno
-    libc.src.math.expm1
-    libc.src.__support.FPUtil.fp_bits
+    libc.src.math.expm1f16
 )
 
 add_fp_unittest(
diff --git a/libc/test/src/math/smoke/expm1f16_test.cpp b/libc/test/src/math/smoke/expm1f16_test.cpp
new file mode 100644
index 00000000000000..3bdbaad2279416
--- /dev/null
+++ b/libc/test/src/math/smoke/expm1f16_test.cpp
@@ -0,0 +1,108 @@
+//===-- Unittests for expm1f16 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "hdr/errno_macros.h"
+#include "hdr/fenv_macros.h"
+#include "src/errno/libc_errno.h"
+#include "src/math/expm1f16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcExpm1f16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+TEST_F(LlvmLibcExpm1f16Test, SpecialNumbers) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::expm1f16(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::expm1f16(sNaN), FE_INVALID);
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::expm1f16(inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(static_cast<float16>(-1.0),
+                            LIBC_NAMESPACE::expm1f16(neg_inf));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::expm1f16(zero));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, LIBC_NAMESPACE::expm1f16(neg_zero));
+  EXPECT_MATH_ERRNO(0);
+}
+
+TEST_F(LlvmLibcExpm1f16Test, Overflow) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::expm1f16(max_normal),
+                              FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  // round(16 * log(2), HP, RN);
+  float16 x = static_cast<float16>(0x1.63p+3);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(inf, LIBC_NAMESPACE::expm1f16(x),
+                                               FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(inf, LIBC_NAMESPACE::expm1f16(x),
+                                              FE_OVERFLOW | FE_INEXACT);
+  EXPECT_MATH_ERRNO(ERANGE);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD(
+      max_normal, LIBC_NAMESPACE::expm1f16(x), FE_INEXACT);
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO(
+      max_normal, LIBC_NAMESPACE::expm1f16(x), FE_INEXACT);
+  EXPECT_MATH_ERRNO(0);
+}
+
+TEST_F(LlvmLibcExpm1f16Test, ResultNearNegOne) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ_WITH_EXCEPTION(static_cast<float16>(-1.0),
+                              LIBC_NAMESPACE::expm1f16(neg_max_normal),
+                              FE_INEXACT);
+
+  // round(-11 * log(2), HP, RN);
+  float16 x = static_cast<float16>(-0x1.e8p+2);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(
+      static_cast<float16>(-0x1.ffcp-1), LIBC_NAMESPACE::expm1f16(x),
+      FE_INEXACT);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(static_cast<float16>(-0x1.ffcp-1),
+                                              LIBC_NAMESPACE::expm1f16(x),
+                                              FE_INEXACT);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD(
+      static_cast<float16>(-1.0), LIBC_NAMESPACE::expm1f16(x), FE_INEXACT);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO(
+      static_cast<float16>(-0x1.ffcp-1), LIBC_NAMESPACE::expm1f16(x),
+      FE_INEXACT);
+
+  x = static_cast<float16>(-0x1.0a4p+3);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(
+      static_cast<float16>(-1.0), LIBC_NAMESPACE::expm1f16(x), FE_INEXACT);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(static_cast<float16>(-0x1.ffcp-1),
+                                              LIBC_NAMESPACE::expm1f16(x),
+                                              FE_INEXACT);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD(
+      static_cast<float16>(-1.0), LIBC_NAMESPACE::expm1f16(x), FE_INEXACT);
+
+  EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO(
+      static_cast<float16>(-0x1.ffcp-1), LIBC_NAMESPACE::expm1f16(x),
+      FE_INEXACT);
+}
diff --git a/libcxx/include/__memory/inout_ptr.h b/libcxx/include/__memory/inout_ptr.h
index 72e1a21ad68671..e5f3ac5d027e8e 100644
--- a/libcxx/include/__memory/inout_ptr.h
+++ b/libcxx/include/__memory/inout_ptr.h
@@ -63,17 +63,17 @@ class _LIBCPP_TEMPLATE_VIS inout_ptr_t {
       }
     }
 
-    using _SP = __pointer_of_or_t<_Smart, _Pointer>;
+    using _SmartPtr = __pointer_of_or_t<_Smart, _Pointer>;
     if constexpr (is_pointer_v<_Smart>) {
-      std::apply([&](auto&&... __args) { __s_ = _Smart(static_cast<_SP>(__p_), std::forward<_Args>(__args)...); },
+      std::apply([&](auto&&... __args) { __s_ = _Smart(static_cast<_SmartPtr>(__p_), std::forward<_Args>(__args)...); },
                  std::move(__a_));
     } else if constexpr (__resettable_smart_pointer_with_args<_Smart, _Pointer, _Args...>) {
-      std::apply([&](auto&&... __args) { __s_.reset(static_cast<_SP>(__p_), std::forward<_Args>(__args)...); },
+      std::apply([&](auto&&... __args) { __s_.reset(static_cast<_SmartPtr>(__p_), std::forward<_Args>(__args)...); },
                  std::move(__a_));
     } else {
-      static_assert(is_constructible_v<_Smart, _SP, _Args...>,
+      static_assert(is_constructible_v<_Smart, _SmartPtr, _Args...>,
                     "The smart pointer must be constructible from arguments of types _Smart, _Pointer, _Args...");
-      std::apply([&](auto&&... __args) { __s_ = _Smart(static_cast<_SP>(__p_), std::forward<_Args>(__args)...); },
+      std::apply([&](auto&&... __args) { __s_ = _Smart(static_cast<_SmartPtr>(__p_), std::forward<_Args>(__args)...); },
                  std::move(__a_));
     }
   }
diff --git a/libcxx/include/__memory/out_ptr.h b/libcxx/include/__memory/out_ptr.h
index 95aa2029c92314..fd99110790cc89 100644
--- a/libcxx/include/__memory/out_ptr.h
+++ b/libcxx/include/__memory/out_ptr.h
@@ -58,14 +58,14 @@ class _LIBCPP_TEMPLATE_VIS out_ptr_t {
       return;
     }
 
-    using _SP = __pointer_of_or_t<_Smart, _Pointer>;
+    using _SmartPtr = __pointer_of_or_t<_Smart, _Pointer>;
     if constexpr (__resettable_smart_pointer_with_args<_Smart, _Pointer, _Args...>) {
-      std::apply([&](auto&&... __args) { __s_.reset(static_cast<_SP>(__p_), std::forward<_Args>(__args)...); },
+      std::apply([&](auto&&... __args) { __s_.reset(static_cast<_SmartPtr>(__p_), std::forward<_Args>(__args)...); },
                  std::move(__a_));
     } else {
-      static_assert(is_constructible_v<_Smart, _SP, _Args...>,
+      static_assert(is_constructible_v<_Smart, _SmartPtr, _Args...>,
                     "The smart pointer must be constructible from arguments of types _Smart, _Pointer, _Args...");
-      std::apply([&](auto&&... __args) { __s_ = _Smart(static_cast<_SP>(__p_), std::forward<_Args>(__args)...); },
+      std::apply([&](auto&&... __args) { __s_ = _Smart(static_cast<_SmartPtr>(__p_), std::forward<_Args>(__args)...); },
                  std::move(__a_));
     }
   }
diff --git a/libcxx/test/libcxx/system_reserved_names.gen.py b/libcxx/test/libcxx/system_reserved_names.gen.py
index 0d935a18addeee..956a8d1abe3c3c 100644
--- a/libcxx/test/libcxx/system_reserved_names.gen.py
+++ b/libcxx/test/libcxx/system_reserved_names.gen.py
@@ -17,7 +17,8 @@
 from libcxx.header_information import lit_header_restrictions, public_headers
 
 for header in public_headers:
-  print(f"""\
+    print(
+        f"""\
 //--- {header}.compile.pass.cpp
 {lit_header_restrictions.get(header, '')}
 
@@ -162,6 +163,18 @@
 #define erase SYSTEM_RESERVED_NAME
 #define refresh SYSTEM_RESERVED_NAME
 
+// Dinkumware libc ctype.h uses these definitions
+#define _XA SYSTEM_RESERVED_NAME
+#define _XS SYSTEM_RESERVED_NAME
+#define _BB SYSTEM_RESERVED_NAME
+#define _CN SYSTEM_RESERVED_NAME
+#define _DI SYSTEM_RESERVED_NAME
+#define _LO SYSTEM_RESERVED_NAME
+#define _PU SYSTEM_RESERVED_NAME
+#define _SP SYSTEM_RESERVED_NAME
+#define _UP SYSTEM_RESERVED_NAME
+#define _XD SYSTEM_RESERVED_NAME
+
 #include <{header}>
 
 // Make sure we don't swallow the definition of the macros we push/pop
@@ -172,4 +185,5 @@
 static_assert(__builtin_strcmp(STRINGIFY(move), STRINGIFY(SYSTEM_RESERVED_NAME)) == 0, "");
 static_assert(__builtin_strcmp(STRINGIFY(erase), STRINGIFY(SYSTEM_RESERVED_NAME)) == 0, "");
 static_assert(__builtin_strcmp(STRINGIFY(refresh), STRINGIFY(SYSTEM_RESERVED_NAME)) == 0, "");
-""")
+"""
+    )
diff --git a/libcxx/utils/synchronize_csv_status_files.py b/libcxx/utils/synchronize_csv_status_files.py
new file mode 100755
index 00000000000000..b44b02f5304c0a
--- /dev/null
+++ b/libcxx/utils/synchronize_csv_status_files.py
@@ -0,0 +1,236 @@
+#!/usr/bin/env python3
+# ===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ===----------------------------------------------------------------------===##
+
+from typing import List, Dict, Tuple, Optional
+import csv
+import itertools
+import json
+import os
+import pathlib
+import re
+import subprocess
+
+# Number of the 'Libc++ Standards Conformance' project on Github, as passed to `gh project item-list`
+LIBCXX_CONFORMANCE_PROJECT = '31'
+
+class PaperInfo:
+    paper_number: str
+    """
+    Identifier for the paper or the LWG issue. This must be something like 'PnnnnRx', 'Nxxxxx' or 'LWGxxxxx'.
+    """
+
+    paper_name: str
+    """
+    Plain text string representing the name of the paper.
+    """
+
+    meeting: Optional[str]
+    """
+    Plain text string representing the meeting at which the paper/issue was voted.
+    """
+
+    status: Optional[str]
+    """
+    Status of the paper/issue. This must be '|Complete|', '|Nothing To Do|', '|In Progress|',
+    '|Partial|' or 'Resolved by <something>'.
+    """
+
+    first_released_version: Optional[str]
+    """
+    First version of LLVM in which this paper/issue was resolved.
+    """
+
+    labels: Optional[List[str]]
+    """
+    List of labels to associate to the issue in the status-tracking table. Supported labels are
+    'format', 'ranges', 'spaceship', 'flat_containers', 'concurrency TS' and 'DR'.
+    """
+
+    original: Optional[object]
+    """
+    Object from which this PaperInfo originated. This is used to track the CSV row or Github issue that
+    was used to generate this PaperInfo and is useful for error reporting purposes.
+    """
+
+    def __init__(self, paper_number: str, paper_name: str,
+                       meeting: Optional[str] = None,
+                       status: Optional[str] = None,
+                       first_released_version: Optional[str] = None,
+                       labels: Optional[List[str]] = None,
+                       original: Optional[object] = None):
+        self.paper_number = paper_number
+        self.paper_name = paper_name
+        self.meeting = meeting
+        self.status = status
+        self.first_released_version = first_released_version
+        self.labels = labels
+        self.original = original
+
+    def for_printing(self) -> Tuple[str, str, str, str, str, str]:
+        """Return the six columns of the CSV row for this paper; unset optional fields become ''."""
+        return (
+            f'`{self.paper_number} <https://wg21.link/{self.paper_number}>`__',
+            self.paper_name,
+            self.meeting if self.meeting is not None else '',
+            self.status if self.status is not None else '',
+            self.first_released_version if self.first_released_version is not None else '',
+            ' '.join(f'|{label}|' for label in self.labels) if self.labels is not None else '',
+        )
+
+    def __repr__(self) -> str:
+        return repr(self.original) if self.original is not None else repr(self.for_printing())
+
+    def is_implemented(self) -> bool:
+        """Return True unless there is no status or the status mentions 'in progress' or 'partial'."""
+        if self.status is None:
+            return False
+        # The match is case-insensitive, so '|In Progress|' and '|Partial|' both count as not implemented.
+        return re.search(r'(in progress|partial)', self.status.lower()) is None
+
+    @staticmethod
+    def from_csv_row(row: Tuple[str, str, str, str, str, str]) -> 'PaperInfo':
+        """Create a PaperInfo object representing one row of our status-tracking CSV files."""
+        # Extract the paper number from the first column
+        match = re.search(r"((P[0-9R]+)|(LWG[0-9]+)|(N[0-9]+))\s+", row[0])
+        if match is None:
+            raise RuntimeError(f"Can't parse paper/issue number out of row: {row}")
+
+        # A label is either a '|...|' group, which may contain spaces (e.g. '|concurrency TS|'), or a bare word.
+        labels = [label.strip('|') for label in re.findall(r'\|[^|]*\||\S+', row[5])]
+
+        return PaperInfo(
+            paper_number=match.group(1),
+            paper_name=row[1],
+            meeting=row[2] or None,
+            status=row[3] or None,
+            first_released_version=row[4] or None,
+            labels=labels or None,
+            original=row,
+        )
+
+    @staticmethod
+    def from_github_issue(issue: Dict) -> 'PaperInfo':
+        """Create a PaperInfo object from one item of the JSON obtained by querying a Github Project."""
+        # Extract the paper number from the issue title
+        match = re.search(r"((P[0-9R]+)|(LWG[0-9]+)|(N[0-9]+)):", issue['title'])
+        if match is None:
+            raise RuntimeError(f"Issue doesn't have a title that we know how to parse: {issue}")
+        paper = match.group(1)
+        # The CSV keeps the number and the name in separate columns, so drop a leading 'Pnnnn:' from the name.
+        name = issue['title'][match.end():].strip() if match.start() == 0 else issue['title']
+
+        # Figure out the status of the paper according to the Github project information. Sadly, we can't make
+        # a finer-grained distinction about *how* the issue was closed (such as Nothing To Do or similar).
+        status = '|Complete|' if 'status' in issue and issue['status'] == 'Done' else None
+
+        # Handle labels. Like 'status', the 'labels' entry is treated as optional.
+        valid_labels = ('format', 'ranges', 'spaceship', 'flat_containers', 'concurrency TS', 'DR')
+        labels = [label for label in (issue.get('labels') or []) if label in valid_labels]
+
+        return PaperInfo(
+            paper_number=paper,
+            paper_name=name,
+            meeting=issue.get('meeting Voted', None),
+            status=status,
+            first_released_version=None, # TODO
+            labels=labels if labels else None,
+            original=issue,
+        )
+
+def load_csv(file: pathlib.Path) -> List[Tuple]:
+    """
+    Read the comma-separated file at `file` and return all of its rows, in file order.
+    """
+    with open(file, newline='') as stream:
+        # csv.reader yields one list of strings per row; materialize them before the file is closed.
+        return list(csv.reader(stream, delimiter=','))
+
+def write_csv(output: pathlib.Path, rows: List[Tuple]):
+    """Overwrite `output` with `rows`, quoting every field and terminating each row with a newline."""
+    with open(output, 'w', newline='') as stream:
+        emitter = csv.writer(stream, quoting=csv.QUOTE_ALL, lineterminator='\n')
+        emitter.writerows(rows)
+
+def sync_csv(rows: List[Tuple], from_github: List[PaperInfo]) -> List[Tuple]:
+    """
+    Given a list of CSV rows representing an existing status file and a list of PaperInfos representing
+    up-to-date (but potentially incomplete) tracking information from Github, this function returns the
+    new CSV rows synchronized with the up-to-date information. The first row is the header and is kept.
+
+    Note that this only tracks changes from 'not implemented' issues to 'implemented'. If an up-to-date
+    PaperInfo reports that a paper is not implemented but the existing CSV rows report it as implemented,
+    it is an error (RuntimeError), as is a not-yet-implemented row with zero or several tracking issues.
+    """
+    results = list(rows[:1]) # Start with the header (nothing at all if the file is empty)
+    for row in rows[1:]: # Skip the header
+        # If the row is blank or starts with an empty entry, this is a "separator row" between meetings.
+        # Preserve it as-is. (csv.reader produces an empty list for a blank line.)
+        if not row or row[0] == "":
+            results.append(row)
+            continue
+
+        paper = PaperInfo.from_csv_row(row)
+
+        # If the row is already implemented, basically keep it unchanged but also validate that we're not
+        # out-of-sync with any still-open Github issue tracking the same paper.
+        if paper.is_implemented():
+            dangling = [gh for gh in from_github if gh.paper_number == paper.paper_number and not gh.is_implemented()]
+            if dangling:
+                raise RuntimeError(f"We found the following open tracking issues for a row which is already marked as implemented:\nrow: {row}\ntracking issues: {dangling}")
+            results.append(paper.for_printing())
+        else:
+            # Find any Github issues tracking this paper
+            tracking = [gh for gh in from_github if paper.paper_number == gh.paper_number]
+
+            # If there is no tracking issue for that row in the CSV, this is an error since we're
+            # missing a Github issue.
+            if not tracking:
+                raise RuntimeError(f"Can't find any Github issue for CSV row which isn't marked as done yet: {row}")
+
+            # If there's more than one tracking issue, something is weird too.
+            if len(tracking) > 1:
+                raise RuntimeError(f"Found a row with more than one tracking issue: {row}\ntracked by: {tracking}")
+
+            # If the issue is closed, synchronize the row based on the Github issue. Otherwise, use the
+            # existing CSV row as-is.
+            results.append(tracking[0].for_printing() if tracking[0].is_implemented() else row)
+
+    return results
+
+CSV_FILES_TO_SYNC = [
+    'Cxx14Issues.csv',
+    'Cxx17Issues.csv',
+    'Cxx17Papers.csv',
+    'Cxx20Issues.csv',
+    'Cxx20Papers.csv',
+    # TODO: Enable the files below once their Github tracking issues have been created.
+    # 'Cxx23Issues.csv',
+    # 'Cxx23Papers.csv',
+    # 'Cxx2cIssues.csv',
+    # 'Cxx2cPapers.csv',
+]
+
+def main():
+    """Rewrite every status file listed in CSV_FILES_TO_SYNC using the Github conformance project."""
+    status_dir = pathlib.Path(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) / 'docs' / 'Status'
+
+    # Ask the `gh` CLI for every item of the project and turn each one into a PaperInfo.
+    print("Loading all issues from Github")
+    query = ['gh', 'project', 'item-list', LIBCXX_CONFORMANCE_PROJECT, '--owner', 'llvm', '--format', 'json', '--limit', '9999999']
+    items = json.loads(subprocess.check_output(query))['items']
+    tracked = list(map(PaperInfo.from_github_issue, items))
+
+    # Each file is loaded, synchronized and written back to the same path.
+    for filename in CSV_FILES_TO_SYNC:
+        print(f"Synchronizing {filename} with Github issues")
+        path = status_dir / filename
+        write_csv(path, sync_csv(load_csv(path), tracked))
+
+if __name__ == '__main__':
+    main()
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp
index 74e392249a94eb..83ba27783da471 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp
@@ -352,8 +352,11 @@ void GDBRemoteCommunicationClient::GetRemoteQSupported() {
 
   // build the qSupported packet
   std::vector<std::string> features = {"xmlRegisters=i386,arm,mips,arc",
-                                       "multiprocess+", "fork-events+",
-                                       "vfork-events+"};
+                                       "multiprocess+",
+                                       "fork-events+",
+                                       "vfork-events+",
+                                       "swbreak+",
+                                       "hwbreak+"};
   StreamString packet;
   packet.PutCString("qSupported");
   for (uint32_t i = 0; i < features.size(); ++i) {
diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp
index a0b08a219ae147..345f5cd5de8491 100644
--- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp
@@ -4245,6 +4245,10 @@ std::vector<std::string> GDBRemoteCommunicationServerLLGS::HandleFeatures(
             .Case("vfork-events+", Extension::vfork)
             .Default({});
 
+  // We consume lldb's swbreak/hwbreak feature, but it doesn't change the
+  // behaviour of lldb-server. We always adjust the program counter for targets
+  // like x86
+
   m_extensions_supported &= plugin_features;
 
   // fork & vfork require multiprocess
diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
index 6f9c2cc1e4b4e8..c7ce368ab41ce2 100644
--- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
@@ -2354,6 +2354,9 @@ StateType ProcessGDBRemote::SetThreadStopInfo(StringExtractor &stop_packet) {
         if (!key.getAsInteger(16, reg))
           expedited_register_map[reg] = std::string(std::move(value));
       }
+      // swbreak and hwbreak are also expected keys, but we don't need to
+      // change our behaviour for them because lldb always expects the remote
+      // to adjust the program counter (if relevant, e.g., for x86 targets)
     }
 
     if (stop_pid != LLDB_INVALID_PROCESS_ID && stop_pid != pid) {
diff --git a/lldb/test/API/commands/expression/import-std-module/shared_ptr/TestSharedPtrFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/shared_ptr/TestSharedPtrFromStdModule.py
index f8e746e94730d7..d072d4f84d4544 100644
--- a/lldb/test/API/commands/expression/import-std-module/shared_ptr/TestSharedPtrFromStdModule.py
+++ b/lldb/test/API/commands/expression/import-std-module/shared_ptr/TestSharedPtrFromStdModule.py
@@ -10,6 +10,7 @@
 class TestSharedPtr(TestBase):
     @add_test_categories(["libc++"])
     @skipIf(compiler=no_match("clang"))
+    @skipIf(compiler="clang", compiler_version=['<', '17.0'])
     def test(self):
         self.build()
 
diff --git a/lldb/test/API/commands/expression/import-std-module/weak_ptr-dbg-info-content/TestDbgInfoContentWeakPtrFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/weak_ptr-dbg-info-content/TestDbgInfoContentWeakPtrFromStdModule.py
index 5ba8acc6774472..3da93914f3456d 100644
--- a/lldb/test/API/commands/expression/import-std-module/weak_ptr-dbg-info-content/TestDbgInfoContentWeakPtrFromStdModule.py
+++ b/lldb/test/API/commands/expression/import-std-module/weak_ptr-dbg-info-content/TestDbgInfoContentWeakPtrFromStdModule.py
@@ -10,6 +10,7 @@
 class TestDbgInfoContentWeakPtr(TestBase):
     @add_test_categories(["libc++"])
     @skipIf(compiler=no_match("clang"))
+    @skipIf(compiler="clang", compiler_version=['<', '17.0'])
     def test(self):
         self.build()
 
diff --git a/lldb/test/API/commands/expression/import-std-module/weak_ptr/TestWeakPtrFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/weak_ptr/TestWeakPtrFromStdModule.py
index f4a230ce26f3fd..3363c8c9dc87b2 100644
--- a/lldb/test/API/commands/expression/import-std-module/weak_ptr/TestWeakPtrFromStdModule.py
+++ b/lldb/test/API/commands/expression/import-std-module/weak_ptr/TestWeakPtrFromStdModule.py
@@ -10,6 +10,7 @@
 class TestSharedPtr(TestBase):
     @add_test_categories(["libc++"])
     @skipIf(compiler=no_match("clang"))
+    @skipIf(compiler="clang", compiler_version=['<', '17.0'])
     def test(self):
         self.build()
 
diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestStopPCs.py b/lldb/test/API/functionalities/gdb_remote_client/TestStopPCs.py
index ef28cc95f7ad4b..3faae5fec38ba1 100644
--- a/lldb/test/API/functionalities/gdb_remote_client/TestStopPCs.py
+++ b/lldb/test/API/functionalities/gdb_remote_client/TestStopPCs.py
@@ -10,13 +10,17 @@ class TestStopPCs(GDBRemoteTestBase):
     def test(self):
         class MyResponder(MockGDBServerResponder):
             def haltReason(self):
-                return "T02thread:1ff0d;threads:1ff0d,2ff0d;thread-pcs:10001bc00,10002bc00;"
+                # lldb should treat the default halt reason, hwbreak and swbreak in the same way: it expects
+                # the stub to have corrected the PC already, so lldb should not modify it further.
+                return "T02thread:1ff0d;threads:1ff0d,2ff0d,3ff0d;thread-pcs:10001bc00,10002bc00,10003bc00;"
 
             def threadStopInfo(self, threadnum):
                 if threadnum == 0x1FF0D:
-                    return "T02thread:1ff0d;threads:1ff0d,2ff0d;thread-pcs:10001bc00,10002bc00;"
+                    return "T02thread:1ff0d;threads:1ff0d,2ff0d,3ff0d;thread-pcs:10001bc00,10002bc00,10003bc00;"
                 if threadnum == 0x2FF0D:
-                    return "T00thread:2ff0d;threads:1ff0d,2ff0d;thread-pcs:10001bc00,10002bc00;"
+                    return "T00swbreak:;thread:2ff0d;threads:1ff0d,2ff0d,3ff0d;thread-pcs:10001bc00,10002bc00,10003bc00;"
+                if threadnum == 0x3FF0D:
+                    return "T00hwbreak:;thread:3ff0d;threads:1ff0d,2ff0d,3ff0d;thread-pcs:10001bc00,10002bc00,10003bc00;"
 
             def qXferRead(self, obj, annex, offset, length):
                 if annex == "target.xml":
@@ -40,10 +44,13 @@ def qXferRead(self, obj, annex, offset, length):
             self.addTearDownHook(lambda: self.runCmd("log disable gdb-remote packets"))
         process = self.connect(target)
 
-        self.assertEqual(process.GetNumThreads(), 2)
+        self.assertEqual(process.GetNumThreads(), 3)
         th0 = process.GetThreadAtIndex(0)
         th1 = process.GetThreadAtIndex(1)
+        th2 = process.GetThreadAtIndex(2)
         self.assertEqual(th0.GetThreadID(), 0x1FF0D)
         self.assertEqual(th1.GetThreadID(), 0x2FF0D)
+        self.assertEqual(th2.GetThreadID(), 0x3FF0D)
         self.assertEqual(th0.GetFrameAtIndex(0).GetPC(), 0x10001BC00)
         self.assertEqual(th1.GetFrameAtIndex(0).GetPC(), 0x10002BC00)
+        self.assertEqual(th2.GetFrameAtIndex(0).GetPC(), 0x10003BC00)
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index 3e7e3a965559af..99959ecfae9cba 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -1071,7 +1071,7 @@ macro(add_llvm_executable name)
 
   if (DEFINED LLVM_ENABLE_EXPORTED_SYMBOLS_IN_EXECUTABLES AND
       NOT LLVM_ENABLE_EXPORTED_SYMBOLS_IN_EXECUTABLES AND
-      NOT ARG_EXPORT_SYMBOLS AND NOT ARG_EXPORT_SYMBOLS_FOR_PLUGINS)
+      NOT ARG_EXPORT_SYMBOLS)
     if(LLVM_LINKER_SUPPORTS_NO_EXPORTED_SYMBOLS)
       set_property(TARGET ${name} APPEND_STRING PROPERTY
         LINK_FLAGS " -Wl,-no_exported_symbols")
diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst
index 2a80813999ea1e..838447f483e510 100644
--- a/llvm/docs/CMake.rst
+++ b/llvm/docs/CMake.rst
@@ -218,6 +218,11 @@ used variables that control features of LLVM and enabled subprojects.
   If you are using an IDE such as Visual Studio or Xcode, you should use
   the IDE settings to set the build type.
 
+  Note: on Windows (building with MSVC or clang-cl), CMake's **RelWithDebInfo**
+  setting does not enable the same optimizations as **Release**. Using the
+  **Release** build type with :ref:`LLVM_ENABLE_PDB <llvm_enable_pdb>` set
+  may be a better option.
+
 **CMAKE_INSTALL_PREFIX**:PATH
   Path where LLVM will be installed when the "install" target is built.
 
@@ -548,6 +553,12 @@ enabled sub-projects. Nearly all of these variable names begin with
   Compile with `Clang Header Modules
   <https://clang.llvm.org/docs/Modules.html>`_.
 
+.. _llvm_enable_pdb:
+
+**LLVM_ENABLE_PDB**:BOOL
+  For Windows builds using MSVC or clang-cl, generate PDB files when
+  :ref:`CMAKE_BUILD_TYPE <cmake_build_type>` is set to Release.
+
 **LLVM_ENABLE_PEDANTIC**:BOOL
   Enable pedantic mode. This disables compiler-specific extensions, if
   possible. Defaults to ON.
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index c2bc1353ee8838..279cfb5aa47d6f 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -525,7 +525,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
         TM.getCodeModel() == CodeModel::Large)
       return false;
 
-    Triple TargetTriple = TM.getTargetTriple();
+    const Triple &TargetTriple = TM.getTargetTriple();
     if (!TargetTriple.isArch64Bit())
       return false;
 
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 1d0124ec755352..9ce3c48a7f76cb 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -675,10 +675,7 @@ class SelectionDAG {
                       bool isTarget = false, bool isOpaque = false);
 
   SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget = false,
-                             bool IsOpaque = false) {
-    return getConstant(APInt::getAllOnes(VT.getScalarSizeInBits()), DL, VT,
-                       IsTarget, IsOpaque);
-  }
+                             bool IsOpaque = false);
 
   SDValue getConstant(const ConstantInt &Val, const SDLoc &DL, EVT VT,
                       bool isTarget = false, bool isOpaque = false);
diff --git a/llvm/include/llvm/Config/llvm-config.h.cmake b/llvm/include/llvm/Config/llvm-config.h.cmake
index 111384b302df68..5a657f125265ea 100644
--- a/llvm/include/llvm/Config/llvm-config.h.cmake
+++ b/llvm/include/llvm/Config/llvm-config.h.cmake
@@ -16,7 +16,7 @@
 
 /* Indicate that this is LLVM compiled from the amd-gfx branch. */
 #define LLVM_HAVE_BRANCH_AMD_GFX
-#define LLVM_MAIN_REVISION 508242
+#define LLVM_MAIN_REVISION 508336
 
 /* Define if LLVM_ENABLE_DUMP is enabled */
 #cmakedefine LLVM_ENABLE_DUMP
diff --git a/llvm/include/llvm/IR/CFG.h b/llvm/include/llvm/IR/CFG.h
index 12ca1b1b9aa979..f8ec0971517a92 100644
--- a/llvm/include/llvm/IR/CFG.h
+++ b/llvm/include/llvm/IR/CFG.h
@@ -304,8 +304,13 @@ template <> struct GraphTraits<BasicBlock*> {
   static NodeRef getEntryNode(BasicBlock *BB) { return BB; }
   static ChildIteratorType child_begin(NodeRef N) { return succ_begin(N); }
   static ChildIteratorType child_end(NodeRef N) { return succ_end(N); }
+
+  static unsigned getNumber(const BasicBlock *BB) { return BB->getNumber(); }
 };
 
+static_assert(GraphHasNodeNumbers<BasicBlock *>,
+              "GraphTraits getNumber() not detected");
+
 template <> struct GraphTraits<const BasicBlock*> {
   using NodeRef = const BasicBlock *;
   using ChildIteratorType = const_succ_iterator;
@@ -314,8 +319,13 @@ template <> struct GraphTraits<const BasicBlock*> {
 
   static ChildIteratorType child_begin(NodeRef N) { return succ_begin(N); }
   static ChildIteratorType child_end(NodeRef N) { return succ_end(N); }
+
+  static unsigned getNumber(const BasicBlock *BB) { return BB->getNumber(); }
 };
 
+static_assert(GraphHasNodeNumbers<const BasicBlock *>,
+              "GraphTraits getNumber() not detected");
+
 // Provide specializations of GraphTraits to be able to treat a function as a
 // graph of basic blocks... and to walk it in inverse order.  Inverse order for
 // a function is considered to be when traversing the predecessor edges of a BB
@@ -328,8 +338,13 @@ template <> struct GraphTraits<Inverse<BasicBlock*>> {
   static NodeRef getEntryNode(Inverse<BasicBlock *> G) { return G.Graph; }
   static ChildIteratorType child_begin(NodeRef N) { return pred_begin(N); }
   static ChildIteratorType child_end(NodeRef N) { return pred_end(N); }
+
+  static unsigned getNumber(const BasicBlock *BB) { return BB->getNumber(); }
 };
 
+static_assert(GraphHasNodeNumbers<Inverse<BasicBlock *>>,
+              "GraphTraits getNumber() not detected");
+
 template <> struct GraphTraits<Inverse<const BasicBlock*>> {
   using NodeRef = const BasicBlock *;
   using ChildIteratorType = const_pred_iterator;
@@ -337,8 +352,13 @@ template <> struct GraphTraits<Inverse<const BasicBlock*>> {
   static NodeRef getEntryNode(Inverse<const BasicBlock *> G) { return G.Graph; }
   static ChildIteratorType child_begin(NodeRef N) { return pred_begin(N); }
   static ChildIteratorType child_end(NodeRef N) { return pred_end(N); }
+
+  static unsigned getNumber(const BasicBlock *BB) { return BB->getNumber(); }
 };
 
+static_assert(GraphHasNodeNumbers<Inverse<const BasicBlock *>>,
+              "GraphTraits getNumber() not detected");
+
 //===--------------------------------------------------------------------===//
 // GraphTraits specializations for function basic block graphs (CFGs)
 //===--------------------------------------------------------------------===//
@@ -362,6 +382,13 @@ template <> struct GraphTraits<Function*> : public GraphTraits<BasicBlock*> {
   }
 
   static size_t size(Function *F) { return F->size(); }
+
+  static unsigned getMaxNumber(const Function *F) {
+    return F->getMaxBlockNumber();
+  }
+  static unsigned getNumberEpoch(const Function *F) {
+    return F->getBlockNumberEpoch();
+  }
 };
 template <> struct GraphTraits<const Function*> :
   public GraphTraits<const BasicBlock*> {
@@ -379,6 +406,13 @@ template <> struct GraphTraits<const Function*> :
   }
 
   static size_t size(const Function *F) { return F->size(); }
+
+  static unsigned getMaxNumber(const Function *F) {
+    return F->getMaxBlockNumber();
+  }
+  static unsigned getNumberEpoch(const Function *F) {
+    return F->getBlockNumberEpoch();
+  }
 };
 
 // Provide specializations of GraphTraits to be able to treat a function as a
@@ -391,12 +425,26 @@ template <> struct GraphTraits<Inverse<Function*>> :
   static NodeRef getEntryNode(Inverse<Function *> G) {
     return &G.Graph->getEntryBlock();
   }
+
+  static unsigned getMaxNumber(const Function *F) {
+    return F->getMaxBlockNumber();
+  }
+  static unsigned getNumberEpoch(const Function *F) {
+    return F->getBlockNumberEpoch();
+  }
 };
 template <> struct GraphTraits<Inverse<const Function*>> :
   public GraphTraits<Inverse<const BasicBlock*>> {
   static NodeRef getEntryNode(Inverse<const Function *> G) {
     return &G.Graph->getEntryBlock();
   }
+
+  static unsigned getMaxNumber(const Function *F) {
+    return F->getMaxBlockNumber();
+  }
+  static unsigned getNumberEpoch(const Function *F) {
+    return F->getBlockNumberEpoch();
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h
index 8e6a8f62b3a048..f4b4b730bee2ae 100644
--- a/llvm/include/llvm/IR/DataLayout.h
+++ b/llvm/include/llvm/IR/DataLayout.h
@@ -45,7 +45,6 @@ namespace llvm {
 
 class GlobalVariable;
 class LLVMContext;
-class Module;
 class StructLayout;
 class Triple;
 class Value;
@@ -186,17 +185,10 @@ class DataLayout {
   /// if the string is malformed.
   Error parseSpecifier(StringRef Desc);
 
-  // Free all internal data structures.
-  void clear();
-
 public:
-  /// Constructs a DataLayout from a specification string. See reset().
-  explicit DataLayout(StringRef LayoutDescription) {
-    reset(LayoutDescription);
-  }
-
-  /// Initialize target data from properties stored in the module.
-  explicit DataLayout(const Module *M);
+  /// Constructs a DataLayout from a specification string.
+  /// WARNING: Aborts execution if the string is malformed. Use parse() instead.
+  explicit DataLayout(StringRef LayoutString);
 
   DataLayout(const DataLayout &DL) { *this = DL; }
 
@@ -207,11 +199,6 @@ class DataLayout {
   bool operator==(const DataLayout &Other) const;
   bool operator!=(const DataLayout &Other) const { return !(*this == Other); }
 
-  void init(const Module *M);
-
-  /// Parse a data layout string (with fallback to default values).
-  void reset(StringRef LayoutDescription);
-
   /// Parse a data layout string and return the layout. Return an error
   /// description on failure.
   static Expected<DataLayout> parse(StringRef LayoutDescription);
diff --git a/llvm/include/llvm/Support/GenericDomTree.h b/llvm/include/llvm/Support/GenericDomTree.h
index 7e2b68e6faea29..45ef38b965b752 100644
--- a/llvm/include/llvm/Support/GenericDomTree.h
+++ b/llvm/include/llvm/Support/GenericDomTree.h
@@ -397,6 +397,8 @@ class DominatorTreeBase {
   /// may (but is not required to) be null for a forward (backwards)
   /// statically unreachable block.
   DomTreeNodeBase<NodeT> *getNode(const NodeT *BB) const {
+    assert((!BB || Parent == NodeTrait::getParent(const_cast<NodeT *>(BB))) &&
+           "cannot get DomTreeNode of block with different parent");
     if (auto Idx = getNodeIndex(BB); Idx && *Idx < DomTreeNodes.size())
       return DomTreeNodes[*Idx].get();
     return nullptr;
diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h
index 32bf7b8c96be3d..5132dca7c6a22b 100644
--- a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h
+++ b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h
@@ -449,9 +449,6 @@ SampleProfileLoaderBaseImpl<BT>::getInstWeightImpl(const InstructionT &Inst) {
   return R;
 }
 
-// Here use error_code to represent: 1) The dangling probe. 2) Ignore the weight
-// of non-probe instruction. So if all instructions of the BB give error_code,
-// tell the inference algorithm to infer the BB weight.
 template <typename BT>
 ErrorOr<uint64_t>
 SampleProfileLoaderBaseImpl<BT>::getProbeWeight(const InstructionT &Inst) {
@@ -464,17 +461,13 @@ SampleProfileLoaderBaseImpl<BT>::getProbeWeight(const InstructionT &Inst) {
     return std::error_code();
 
   const FunctionSamples *FS = findFunctionSamples(Inst);
-  // If none of the instruction has FunctionSample, we choose to return zero
-  // value sample to indicate the BB is cold. This could happen when the
-  // instruction is from inlinee and no profile data is found.
-  // FIXME: This should not be affected by the source drift issue as 1) if the
-  // newly added function is top-level inliner, it won't match the CFG checksum
-  // in the function profile or 2) if it's the inlinee, the inlinee should have
-  // a profile, otherwise it wouldn't be inlined. For non-probe based profile,
-  // we can improve it by adding a switch for profile-sample-block-accurate for
-  // block level counts in the future.
-  if (!FS)
-    return 0;
+  if (!FS) {
+    // If we can't find the function samples for a probe, it could be because
+    // the probe was later optimized away or the inlining context is mismatched.
+    // We treat it as unknown, leaving it to profile inference instead of
+    // forcing a zero count.
+    return std::error_code();
+  }
 
   auto R = FS->findSamplesAt(Probe->Id, Probe->Discriminator);
   if (R) {
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index 345e5a0195201c..4b65fa0ae41b2f 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -3246,8 +3246,7 @@ InlineCostAnnotationPrinterPass::run(Function &F,
   };
   Module *M = F.getParent();
   ProfileSummaryInfo PSI(*M);
-  DataLayout DL(M);
-  TargetTransformInfo TTI(DL);
+  TargetTransformInfo TTI(M->getDataLayout());
   // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
   // In the current implementation, the type of InlineParams doesn't matter as
   // the pass serves only for verification of inliner's decisions.
diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp
index 53ed60f51fde06..e1abf5e4d885ec 100644
--- a/llvm/lib/Analysis/MemoryBuiltins.cpp
+++ b/llvm/lib/Analysis/MemoryBuiltins.cpp
@@ -660,7 +660,8 @@ Value *llvm::lowerObjectSizeCall(
   if (!MustSucceed)
     return nullptr;
 
-  return ConstantInt::get(ResultType, MaxVal ? -1ULL : 0);
+  return MaxVal ? Constant::getAllOnesValue(ResultType)
+                : Constant::getNullValue(ResultType);
 }
 
 STATISTIC(ObjectVisitorArgument,
diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
index 9f7baa983f1229..79504ca7b73c8f 100644
--- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -291,10 +291,6 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI,
   if (isa<GlobalValue>(LoadOperand))
     return MemDepResult::getUnknown();
 
-  // Queue to process all pointers that are equivalent to load operand.
-  SmallVector<const Value *, 8> LoadOperandsQueue;
-  LoadOperandsQueue.push_back(LoadOperand);
-
   Instruction *ClosestDependency = nullptr;
   // Order of instructions in uses list is unpredictible. In order to always
   // get the same result, we will look for the closest dominance.
@@ -305,44 +301,19 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI,
     return Best;
   };
 
-  // FIXME: This loop is O(N^2) because dominates can be O(n) and in worst case
-  // we will see all the instructions. This should be fixed in MSSA.
-  while (!LoadOperandsQueue.empty()) {
-    const Value *Ptr = LoadOperandsQueue.pop_back_val();
-    assert(Ptr && !isa<GlobalValue>(Ptr) &&
-           "Null or GlobalValue should not be inserted");
-
-    for (const Use &Us : Ptr->uses()) {
-      auto *U = dyn_cast<Instruction>(Us.getUser());
-      if (!U || U == LI || !DT.dominates(U, LI))
-        continue;
-
-      // Bitcast or gep with zeros are using Ptr. Add to queue to check it's
-      // users.      U = bitcast Ptr
-      if (isa<BitCastInst>(U)) {
-        LoadOperandsQueue.push_back(U);
-        continue;
-      }
-      // Gep with zeros is equivalent to bitcast.
-      // FIXME: we are not sure if some bitcast should be canonicalized to gep 0
-      // or gep 0 to bitcast because of SROA, so there are 2 forms. When
-      // typeless pointers will be ready then both cases will be gone
-      // (and this BFS also won't be needed).
-      if (auto *GEP = dyn_cast<GetElementPtrInst>(U))
-        if (GEP->hasAllZeroIndices()) {
-          LoadOperandsQueue.push_back(U);
-          continue;
-        }
+  for (const Use &Us : LoadOperand->uses()) {
+    auto *U = dyn_cast<Instruction>(Us.getUser());
+    if (!U || U == LI || !DT.dominates(U, LI))
+      continue;
 
-      // If we hit load/store with the same invariant.group metadata (and the
-      // same pointer operand) we can assume that value pointed by pointer
-      // operand didn't change.
-      if ((isa<LoadInst>(U) ||
-           (isa<StoreInst>(U) &&
-            cast<StoreInst>(U)->getPointerOperand() == Ptr)) &&
-          U->hasMetadata(LLVMContext::MD_invariant_group))
-        ClosestDependency = GetClosestDependency(ClosestDependency, U);
-    }
+    // If we hit a load/store with the same invariant.group metadata (and the
+    // same pointer operand), we can assume that the value pointed to by the
+    // pointer operand didn't change.
+    if ((isa<LoadInst>(U) ||
+         (isa<StoreInst>(U) &&
+          cast<StoreInst>(U)->getPointerOperand() == LoadOperand)) &&
+        U->hasMetadata(LLVMContext::MD_invariant_group))
+      ClosestDependency = GetClosestDependency(ClosestDependency, U);
   }
 
   if (!ClosestDependency)
diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp
index 48ef73e59045e7..1583e0e31efc14 100644
--- a/llvm/lib/Analysis/MemorySSA.cpp
+++ b/llvm/lib/Analysis/MemorySSA.cpp
@@ -2507,45 +2507,22 @@ getInvariantGroupClobberingInstruction(Instruction &I, DominatorTree &DT) {
   if (isa<Constant>(PointerOperand))
     return nullptr;
 
-  // Queue to process all pointers that are equivalent to load operand.
-  SmallVector<const Value *, 8> PointerUsesQueue;
-  PointerUsesQueue.push_back(PointerOperand);
-
   const Instruction *MostDominatingInstruction = &I;
 
-  // FIXME: This loop is O(n^2) because dominates can be O(n) and in worst case
-  // we will see all the instructions. It may not matter in practice. If it
-  // does, we will have to support MemorySSA construction and updates.
-  while (!PointerUsesQueue.empty()) {
-    const Value *Ptr = PointerUsesQueue.pop_back_val();
-    assert(Ptr && !isa<GlobalValue>(Ptr) &&
-           "Null or GlobalValue should not be inserted");
-
-    for (const User *Us : Ptr->users()) {
-      auto *U = dyn_cast<Instruction>(Us);
-      if (!U || U == &I || !DT.dominates(U, MostDominatingInstruction))
-        continue;
-
-      // Add bitcasts and zero GEPs to queue.
-      if (isa<BitCastInst>(U)) {
-        PointerUsesQueue.push_back(U);
-        continue;
-      }
-      if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
-        if (GEP->hasAllZeroIndices())
-          PointerUsesQueue.push_back(U);
-        continue;
-      }
+  for (const User *Us : PointerOperand->users()) {
+    auto *U = dyn_cast<Instruction>(Us);
+    if (!U || U == &I || !DT.dominates(U, MostDominatingInstruction))
+      continue;
 
-      // If we hit a load/store with an invariant.group metadata and the same
-      // pointer operand, we can assume that value pointed to by the pointer
-      // operand didn't change.
-      if (U->hasMetadata(LLVMContext::MD_invariant_group) &&
-          getLoadStorePointerOperand(U) == Ptr && !U->isVolatile()) {
-        MostDominatingInstruction = U;
-      }
+    // If we hit a load/store with invariant.group metadata and the same
+    // pointer operand, we can assume that the value pointed to by the pointer
+    // operand didn't change.
+    if (U->hasMetadata(LLVMContext::MD_invariant_group) &&
+        getLoadStorePointerOperand(U) == PointerOperand && !U->isVolatile()) {
+      MostDominatingInstruction = U;
     }
   }
+
   return MostDominatingInstruction == &I ? nullptr : MostDominatingInstruction;
 }
 
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index de220d68a59163..9a568a252f8d27 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -11951,62 +11951,94 @@ ScalarEvolution::computeConstantDifference(const SCEV *More, const SCEV *Less) {
   // We avoid subtracting expressions here because this function is usually
   // fairly deep in the call stack (i.e. is called many times).
 
-  // X - X = 0.
   unsigned BW = getTypeSizeInBits(More->getType());
-  if (More == Less)
-    return APInt(BW, 0);
-
-  if (isa<SCEVAddRecExpr>(Less) && isa<SCEVAddRecExpr>(More)) {
-    const auto *LAR = cast<SCEVAddRecExpr>(Less);
-    const auto *MAR = cast<SCEVAddRecExpr>(More);
-
-    if (LAR->getLoop() != MAR->getLoop())
-      return std::nullopt;
-
-    // We look at affine expressions only; not for correctness but to keep
-    // getStepRecurrence cheap.
-    if (!LAR->isAffine() || !MAR->isAffine())
-      return std::nullopt;
+  APInt Diff(BW, 0);
+  // Try various simplifications to reduce the difference to a constant. Limit
+  // the number of allowed simplifications to keep compile-time low.
+  for (unsigned I = 0; I < 4; ++I) {
+    if (More == Less)
+      return Diff;
+
+    // Reduce addrecs with identical steps to their start value.
+    if (isa<SCEVAddRecExpr>(Less) && isa<SCEVAddRecExpr>(More)) {
+      const auto *LAR = cast<SCEVAddRecExpr>(Less);
+      const auto *MAR = cast<SCEVAddRecExpr>(More);
+
+      if (LAR->getLoop() != MAR->getLoop())
+        return std::nullopt;
+
+      // We look at affine expressions only; not for correctness but to keep
+      // getStepRecurrence cheap.
+      if (!LAR->isAffine() || !MAR->isAffine())
+        return std::nullopt;
+
+      if (LAR->getStepRecurrence(*this) != MAR->getStepRecurrence(*this))
+        return std::nullopt;
+
+      Less = LAR->getStart();
+      More = MAR->getStart();
+      continue;
+    }
 
-    if (LAR->getStepRecurrence(*this) != MAR->getStepRecurrence(*this))
+    // Try to cancel out common factors in two add expressions.
+    SmallDenseMap<const SCEV *, int, 8> Multiplicity;
+    auto Add = [&](const SCEV *S, int Mul) {
+      if (auto *C = dyn_cast<SCEVConstant>(S)) {
+        if (Mul == 1) {
+          Diff += C->getAPInt();
+        } else {
+          assert(Mul == -1);
+          Diff -= C->getAPInt();
+        }
+      } else
+        Multiplicity[S] += Mul;
+    };
+    auto Decompose = [&](const SCEV *S, int Mul) {
+      if (isa<SCEVAddExpr>(S)) {
+        for (const SCEV *Op : S->operands())
+          Add(Op, Mul);
+      } else
+        Add(S, Mul);
+    };
+    Decompose(More, 1);
+    Decompose(Less, -1);
+
+    // Check whether all the non-constants cancel out, or reduce to new
+    // More/Less values.
+    const SCEV *NewMore = nullptr, *NewLess = nullptr;
+    for (const auto &[S, Mul] : Multiplicity) {
+      if (Mul == 0)
+        continue;
+      if (Mul == 1) {
+        if (NewMore)
+          return std::nullopt;
+        NewMore = S;
+      } else if (Mul == -1) {
+        if (NewLess)
+          return std::nullopt;
+        NewLess = S;
+      } else
+        return std::nullopt;
+    }
+
+    // Values stayed the same, no point in trying further.
+    if (NewMore == More || NewLess == Less)
       return std::nullopt;
 
-    Less = LAR->getStart();
-    More = MAR->getStart();
+    More = NewMore;
+    Less = NewLess;
 
-    // fall through
-  }
+    // Reduced to constant.
+    if (!More && !Less)
+      return Diff;
 
-  // Try to cancel out common factors in two add expressions.
-  SmallDenseMap<const SCEV *, int, 8> Multiplicity;
-  APInt Diff(BW, 0);
-  auto Add = [&](const SCEV *S, int Mul) {
-    if (auto *C = dyn_cast<SCEVConstant>(S)) {
-      if (Mul == 1) {
-        Diff += C->getAPInt();
-      } else {
-        assert(Mul == -1);
-        Diff -= C->getAPInt();
-      }
-    } else
-      Multiplicity[S] += Mul;
-  };
-  auto Decompose = [&](const SCEV *S, int Mul) {
-    if (isa<SCEVAddExpr>(S)) {
-      for (const SCEV *Op : S->operands())
-        Add(Op, Mul);
-    } else
-      Add(S, Mul);
-  };
-  Decompose(More, 1);
-  Decompose(Less, -1);
-
-  // Check whether all the non-constants cancel out.
-  for (const auto &[_, Mul] : Multiplicity)
-    if (Mul != 0)
+    // Left with variable on only one side, bail out.
+    if (!More || !Less)
       return std::nullopt;
+  }
 
-  return Diff;
+  // Did not reduce to constant.
+  return std::nullopt;
 }
 
 bool ScalarEvolution::isImpliedCondOperandsViaAddRecStart(
diff --git a/llvm/lib/Analysis/TypeMetadataUtils.cpp b/llvm/lib/Analysis/TypeMetadataUtils.cpp
index 67ce1540112bb7..9ec0785eb5034d 100644
--- a/llvm/lib/Analysis/TypeMetadataUtils.cpp
+++ b/llvm/lib/Analysis/TypeMetadataUtils.cpp
@@ -33,6 +33,8 @@ findCallsAtConstantOffset(SmallVectorImpl<DevirtCallSite> &DevirtCalls,
     // after indirect call promotion and inlining, where we may have uses
     // of the vtable pointer guarded by a function pointer check, and a fallback
     // indirect call.
+    if (CI->getFunction() != User->getFunction())
+      continue;
     if (!DT.dominates(CI, User))
       continue;
     if (isa<BitCastInst>(User)) {
diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
index 146276b4fd0bb2..788cdfe3bb13d9 100644
--- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
+++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
@@ -2826,13 +2826,12 @@ bool AssignmentTrackingAnalysis::runOnFunction(Function &F) {
 
   LLVM_DEBUG(dbgs() << "AssignmentTrackingAnalysis run on " << F.getName()
                     << "\n");
-  auto DL = std::make_unique<DataLayout>(F.getParent());
 
   // Clear previous results.
   Results->clear();
 
   FunctionVarLocsBuilder Builder;
-  analyzeFunction(F, *DL.get(), &Builder);
+  analyzeFunction(F, F.getDataLayout(), &Builder);
 
   // Save these results.
   Results->init(Builder);
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 49836b914784fc..d8f33c42a8a14c 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -119,6 +119,8 @@ class AtomicExpandImpl {
   llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
                                  CreateCmpXchgInstFun CreateCmpXchg);
 
+  bool processAtomicInstr(Instruction *I);
+
 public:
   bool run(Function &F, const TargetMachine *TM);
 };
@@ -203,6 +205,143 @@ static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
          Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
 }
 
+bool AtomicExpandImpl::processAtomicInstr(Instruction *I) {
+  auto *LI = dyn_cast<LoadInst>(I);
+  auto *SI = dyn_cast<StoreInst>(I);
+  auto *RMWI = dyn_cast<AtomicRMWInst>(I);
+  auto *CASI = dyn_cast<AtomicCmpXchgInst>(I);
+
+  bool MadeChange = false;
+
+  // If the Size/Alignment is not supported, replace with a libcall.
+  if (LI) {
+    if (!LI->isAtomic())
+      return false;
+
+    if (!atomicSizeSupported(TLI, LI)) {
+      expandAtomicLoadToLibcall(LI);
+      return true;
+    }
+
+    if (TLI->shouldCastAtomicLoadInIR(LI) ==
+        TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
+      I = LI = convertAtomicLoadToIntegerType(LI);
+      MadeChange = true;
+    }
+  } else if (SI) {
+    if (!SI->isAtomic())
+      return false;
+
+    if (!atomicSizeSupported(TLI, SI)) {
+      expandAtomicStoreToLibcall(SI);
+      return true;
+    }
+
+    if (TLI->shouldCastAtomicStoreInIR(SI) ==
+        TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
+      I = SI = convertAtomicStoreToIntegerType(SI);
+      MadeChange = true;
+    }
+  } else if (RMWI) {
+    if (!atomicSizeSupported(TLI, RMWI)) {
+      expandAtomicRMWToLibcall(RMWI);
+      return true;
+    }
+
+    if (TLI->shouldCastAtomicRMWIInIR(RMWI) ==
+        TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
+      I = RMWI = convertAtomicXchgToIntegerType(RMWI);
+      MadeChange = true;
+    }
+  } else if (CASI) {
+    if (!atomicSizeSupported(TLI, CASI)) {
+      expandAtomicCASToLibcall(CASI);
+      return true;
+    }
+
+    // TODO: when we're ready to make the change at the IR level, we can
+    // extend convertCmpXchgToInteger for floating point too.
+    if (CASI->getCompareOperand()->getType()->isPointerTy()) {
+      // TODO: add a TLI hook to control this so that each target can
+      // convert to lowering the original type one at a time.
+      I = CASI = convertCmpXchgToIntegerType(CASI);
+      MadeChange = true;
+    }
+  } else
+    return false;
+
+  if (TLI->shouldInsertFencesForAtomic(I)) {
+    auto FenceOrdering = AtomicOrdering::Monotonic;
+    if (LI && isAcquireOrStronger(LI->getOrdering())) {
+      FenceOrdering = LI->getOrdering();
+      LI->setOrdering(AtomicOrdering::Monotonic);
+    } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
+      FenceOrdering = SI->getOrdering();
+      SI->setOrdering(AtomicOrdering::Monotonic);
+    } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
+                        isAcquireOrStronger(RMWI->getOrdering()))) {
+      FenceOrdering = RMWI->getOrdering();
+      RMWI->setOrdering(AtomicOrdering::Monotonic);
+    } else if (CASI &&
+               TLI->shouldExpandAtomicCmpXchgInIR(CASI) ==
+                   TargetLoweringBase::AtomicExpansionKind::None &&
+               (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
+                isAcquireOrStronger(CASI->getSuccessOrdering()) ||
+                isAcquireOrStronger(CASI->getFailureOrdering()))) {
+      // If a compare and swap is lowered to LL/SC, we can do smarter fence
+      // insertion, with a stronger one on the success path than on the
+      // failure path. As a result, fence insertion is directly done by
+      // expandAtomicCmpXchg in that case.
+      FenceOrdering = CASI->getMergedOrdering();
+      CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
+      CASI->setFailureOrdering(AtomicOrdering::Monotonic);
+    }
+
+    if (FenceOrdering != AtomicOrdering::Monotonic) {
+      MadeChange |= bracketInstWithFences(I, FenceOrdering);
+    }
+  } else if (I->hasAtomicStore() &&
+             TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
+    auto FenceOrdering = AtomicOrdering::Monotonic;
+    if (SI)
+      FenceOrdering = SI->getOrdering();
+    else if (RMWI)
+      FenceOrdering = RMWI->getOrdering();
+    else if (CASI && TLI->shouldExpandAtomicCmpXchgInIR(CASI) !=
+                         TargetLoweringBase::AtomicExpansionKind::LLSC)
+      // LLSC is handled in expandAtomicCmpXchg().
+      FenceOrdering = CASI->getSuccessOrdering();
+
+    IRBuilder Builder(I);
+    if (auto TrailingFence =
+            TLI->emitTrailingFence(Builder, I, FenceOrdering)) {
+      TrailingFence->moveAfter(I);
+      MadeChange = true;
+    }
+  }
+
+  if (LI)
+    MadeChange |= tryExpandAtomicLoad(LI);
+  else if (SI)
+    MadeChange |= tryExpandAtomicStore(SI);
+  else if (RMWI) {
+    // There are two different ways of expanding RMW instructions:
+    // - into a load if it is idempotent
+    // - into a Cmpxchg/LL-SC loop otherwise
+    // we try them in that order.
+
+    if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
+      MadeChange = true;
+
+    } else {
+      MadeChange |= tryExpandAtomicRMW(RMWI);
+    }
+  } else if (CASI)
+    MadeChange |= tryExpandAtomicCmpXchg(CASI);
+
+  return MadeChange;
+}
+
 bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) {
   const auto *Subtarget = TM->getSubtargetImpl(F);
   if (!Subtarget->enableAtomicExpand())
@@ -210,6 +349,8 @@ bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) {
   TLI = Subtarget->getTargetLowering();
   DL = &F.getDataLayout();
 
+  bool MadeChange = false;
+
   SmallVector<Instruction *, 1> AtomicInsts;
 
   // Changing control-flow while iterating through it is a bad idea, so gather a
@@ -218,134 +359,11 @@ bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) {
     if (I.isAtomic() && !isa<FenceInst>(&I))
       AtomicInsts.push_back(&I);
 
-  bool MadeChange = false;
   for (auto *I : AtomicInsts) {
-    auto LI = dyn_cast<LoadInst>(I);
-    auto SI = dyn_cast<StoreInst>(I);
-    auto RMWI = dyn_cast<AtomicRMWInst>(I);
-    auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
-    assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction");
-
-    // If the Size/Alignment is not supported, replace with a libcall.
-    if (LI) {
-      if (!atomicSizeSupported(TLI, LI)) {
-        expandAtomicLoadToLibcall(LI);
-        MadeChange = true;
-        continue;
-      }
-    } else if (SI) {
-      if (!atomicSizeSupported(TLI, SI)) {
-        expandAtomicStoreToLibcall(SI);
-        MadeChange = true;
-        continue;
-      }
-    } else if (RMWI) {
-      if (!atomicSizeSupported(TLI, RMWI)) {
-        expandAtomicRMWToLibcall(RMWI);
-        MadeChange = true;
-        continue;
-      }
-    } else if (CASI) {
-      if (!atomicSizeSupported(TLI, CASI)) {
-        expandAtomicCASToLibcall(CASI);
-        MadeChange = true;
-        continue;
-      }
-    }
-
-    if (LI && TLI->shouldCastAtomicLoadInIR(LI) ==
-                  TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
-      I = LI = convertAtomicLoadToIntegerType(LI);
+    if (processAtomicInstr(I))
       MadeChange = true;
-    } else if (SI &&
-               TLI->shouldCastAtomicStoreInIR(SI) ==
-                   TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
-      I = SI = convertAtomicStoreToIntegerType(SI);
-      MadeChange = true;
-    } else if (RMWI &&
-               TLI->shouldCastAtomicRMWIInIR(RMWI) ==
-                   TargetLoweringBase::AtomicExpansionKind::CastToInteger) {
-      I = RMWI = convertAtomicXchgToIntegerType(RMWI);
-      MadeChange = true;
-    } else if (CASI) {
-      // TODO: when we're ready to make the change at the IR level, we can
-      // extend convertCmpXchgToInteger for floating point too.
-      if (CASI->getCompareOperand()->getType()->isPointerTy()) {
-        // TODO: add a TLI hook to control this so that each target can
-        // convert to lowering the original type one at a time.
-        I = CASI = convertCmpXchgToIntegerType(CASI);
-        MadeChange = true;
-      }
-    }
-
-    if (TLI->shouldInsertFencesForAtomic(I)) {
-      auto FenceOrdering = AtomicOrdering::Monotonic;
-      if (LI && isAcquireOrStronger(LI->getOrdering())) {
-        FenceOrdering = LI->getOrdering();
-        LI->setOrdering(AtomicOrdering::Monotonic);
-      } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
-        FenceOrdering = SI->getOrdering();
-        SI->setOrdering(AtomicOrdering::Monotonic);
-      } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
-                          isAcquireOrStronger(RMWI->getOrdering()))) {
-        FenceOrdering = RMWI->getOrdering();
-        RMWI->setOrdering(AtomicOrdering::Monotonic);
-      } else if (CASI &&
-                 TLI->shouldExpandAtomicCmpXchgInIR(CASI) ==
-                     TargetLoweringBase::AtomicExpansionKind::None &&
-                 (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
-                  isAcquireOrStronger(CASI->getSuccessOrdering()) ||
-                  isAcquireOrStronger(CASI->getFailureOrdering()))) {
-        // If a compare and swap is lowered to LL/SC, we can do smarter fence
-        // insertion, with a stronger one on the success path than on the
-        // failure path. As a result, fence insertion is directly done by
-        // expandAtomicCmpXchg in that case.
-        FenceOrdering = CASI->getMergedOrdering();
-        CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
-        CASI->setFailureOrdering(AtomicOrdering::Monotonic);
-      }
-
-      if (FenceOrdering != AtomicOrdering::Monotonic) {
-        MadeChange |= bracketInstWithFences(I, FenceOrdering);
-      }
-    } else if (I->hasAtomicStore() &&
-               TLI->shouldInsertTrailingFenceForAtomicStore(I)) {
-      auto FenceOrdering = AtomicOrdering::Monotonic;
-      if (SI)
-        FenceOrdering = SI->getOrdering();
-      else if (RMWI)
-        FenceOrdering = RMWI->getOrdering();
-      else if (CASI && TLI->shouldExpandAtomicCmpXchgInIR(CASI) !=
-                           TargetLoweringBase::AtomicExpansionKind::LLSC)
-        // LLSC is handled in expandAtomicCmpXchg().
-        FenceOrdering = CASI->getSuccessOrdering();
-
-      IRBuilder Builder(I);
-      if (auto TrailingFence =
-              TLI->emitTrailingFence(Builder, I, FenceOrdering)) {
-        TrailingFence->moveAfter(I);
-        MadeChange = true;
-      }
-    }
-
-    if (LI)
-      MadeChange |= tryExpandAtomicLoad(LI);
-    else if (SI)
-      MadeChange |= tryExpandAtomicStore(SI);
-    else if (RMWI) {
-      // There are two different ways of expanding RMW instructions:
-      // - into a load if it is idempotent
-      // - into a Cmpxchg/LL-SC loop otherwise
-      // we try them in that order.
-
-      if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
-        MadeChange = true;
-      } else {
-        MadeChange |= tryExpandAtomicRMW(RMWI);
-      }
-    } else if (CASI)
-      MadeChange |= tryExpandAtomicCmpXchg(CASI);
   }
+
   return MadeChange;
 }
 
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 0169a0e466d878..219c60eab04f5a 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -3339,7 +3339,7 @@ void IRTranslator::finishPendingPhis() {
 #ifndef NDEBUG
   DILocationVerifier Verifier;
   GISelObserverWrapper WrapperObserver(&Verifier);
-  RAIIDelegateInstaller DelInstall(*MF, &WrapperObserver);
+  RAIIMFObsDelInstaller ObsInstall(*MF, WrapperObserver);
 #endif // ifndef NDEBUG
   for (auto &Phi : PendingPHIs) {
     const PHINode *PI = Phi.first;
@@ -3966,8 +3966,7 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
     DILocationVerifier Verifier;
     WrapperObserver.addObserver(&Verifier);
 #endif // ifndef NDEBUG
-    RAIIDelegateInstaller DelInstall(*MF, &WrapperObserver);
-    RAIIMFObserverInstaller ObsInstall(*MF, WrapperObserver);
+    RAIIMFObsDelInstaller ObsInstall(*MF, WrapperObserver);
     for (const BasicBlock *BB : RPOT) {
       MachineBasicBlock &MBB = getMBB(*BB);
       // Set the insertion point of all the following translations to
diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp
index 65bf7161441bac..8aa4345cfd6df6 100644
--- a/llvm/lib/CodeGen/GlobalMerge.cpp
+++ b/llvm/lib/CodeGen/GlobalMerge.cpp
@@ -664,7 +664,7 @@ bool GlobalMergeImpl::run(Module &M) {
       continue;
 
     if (!(Opt.MergeExternal && GV.hasExternalLinkage()) &&
-        !GV.hasInternalLinkage())
+        !GV.hasLocalLinkage())
       continue;
 
     PointerType *PT = dyn_cast<PointerType>(GV.getType());
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 5a21ad7ac7e2cd..ddb7c8c54bbfe4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -2412,11 +2412,64 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_COMPRESS(SDNode *N, SDValue &Lo,
                                                    SDValue &Hi) {
   // This is not "trivial", as there is a dependency between the two subvectors.
   // Depending on the number of 1s in the mask, the elements from the Hi vector
-  // need to be moved to the Lo vector. So we just perform this as one "big"
-  // operation and then extract the Lo and Hi vectors from that. This gets rid
-  // of VECTOR_COMPRESS and all other operands can be legalized later.
-  SDValue Compressed = TLI.expandVECTOR_COMPRESS(N, DAG);
-  std::tie(Lo, Hi) = DAG.SplitVector(Compressed, SDLoc(N));
+  // need to be moved to the Lo vector. Passthru values make this even harder.
+  // We try to use VECTOR_COMPRESS if the target has custom lowering with
+  // smaller types and passthru is undef, as it is most likely faster than the
+  // fully expanded path. Otherwise, just do the full expansion as one "big"
+  // operation and then extract the Lo and Hi vectors from that. This gets
+  // rid of VECTOR_COMPRESS and all other operands can be legalized later.
+  SDLoc DL(N);
+  EVT VecVT = N->getValueType(0);
+
+  auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
+  bool HasCustomLowering = false;
+  EVT CheckVT = LoVT;
+  while (CheckVT.getVectorMinNumElements() > 1) {
+    // TLI.isOperationLegalOrCustom requires a legal type, but we could have a
+    // custom lowering for illegal types. So we do the checks separately.
+    if (TLI.isOperationLegal(ISD::VECTOR_COMPRESS, CheckVT) ||
+        TLI.isOperationCustom(ISD::VECTOR_COMPRESS, CheckVT)) {
+      HasCustomLowering = true;
+      break;
+    }
+    CheckVT = CheckVT.getHalfNumVectorElementsVT(*DAG.getContext());
+  }
+
+  SDValue Passthru = N->getOperand(2);
+  if (!HasCustomLowering || !Passthru.isUndef()) {
+    SDValue Compressed = TLI.expandVECTOR_COMPRESS(N, DAG);
+    std::tie(Lo, Hi) = DAG.SplitVector(Compressed, DL, LoVT, HiVT);
+    return;
+  }
+
+  // Try to VECTOR_COMPRESS smaller vectors and combine via a stack store+load.
+  SDValue LoMask, HiMask;
+  std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+  std::tie(LoMask, HiMask) = SplitMask(N->getOperand(1));
+
+  SDValue UndefPassthru = DAG.getUNDEF(LoVT);
+  Lo = DAG.getNode(ISD::VECTOR_COMPRESS, DL, LoVT, Lo, LoMask, UndefPassthru);
+  Hi = DAG.getNode(ISD::VECTOR_COMPRESS, DL, HiVT, Hi, HiMask, UndefPassthru);
+
+  SDValue StackPtr = DAG.CreateStackTemporary(
+      VecVT.getStoreSize(), DAG.getReducedAlign(VecVT, /*UseABI=*/false));
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(
+      MF, cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex());
+
+  // We store LoVec and then insert HiVec starting at offset=|1s| in LoMask.
+  SDValue WideMask =
+      DAG.getNode(ISD::ZERO_EXTEND, DL, LoMask.getValueType(), LoMask);
+  SDValue Offset = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, WideMask);
+  Offset = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Offset);
+
+  SDValue Chain = DAG.getEntryNode();
+  Chain = DAG.getStore(Chain, DL, Lo, StackPtr, PtrInfo);
+  Chain = DAG.getStore(Chain, DL, Hi, Offset,
+                       MachinePointerInfo::getUnknownStack(MF));
+
+  SDValue Compressed = DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo);
+  std::tie(Lo, Hi) = DAG.SplitVector(Compressed, DL);
 }
 
 void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
@@ -5790,7 +5843,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_COMPRESS(SDNode *N) {
       TLI.getTypeToTransformTo(*DAG.getContext(), Vec.getValueType());
   EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(),
                                     Mask.getValueType().getVectorElementType(),
-                                    WideVecVT.getVectorNumElements());
+                                    WideVecVT.getVectorElementCount());
 
   SDValue WideVec = ModifyToType(Vec, WideVecVT);
   SDValue WideMask = ModifyToType(Mask, WideMaskVT, /*FillWithZeroes=*/true);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index c3a7df5361cd45..f9701b3ccbffae 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1748,6 +1748,12 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL,
   return Result;
 }
 
+SDValue SelectionDAG::getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget,
+                                         bool IsOpaque) {
+  return getConstant(APInt::getAllOnes(VT.getScalarSizeInBits()), DL, VT,
+                     IsTarget, IsOpaque);
+}
+
 SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, const SDLoc &DL,
                                         bool isTarget) {
   return getConstant(Val, DL, TLI->getPointerTy(getDataLayout()), isTarget);
@@ -5140,12 +5146,8 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly,
   if (Op.getOpcode() == ISD::FREEZE)
     return true;
 
-  // TODO: Assume we don't know anything for now.
   EVT VT = Op.getValueType();
-  if (VT.isScalableVector())
-    return false;
-
-  APInt DemandedElts = VT.isVector()
+  APInt DemandedElts = VT.isFixedLengthVector()
                            ? APInt::getAllOnes(VT.getVectorNumElements())
                            : APInt(1, 1);
   return isGuaranteedNotToBeUndefOrPoison(Op, DemandedElts, PoisonOnly, Depth);
@@ -5190,6 +5192,10 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
     }
     return true;
 
+  case ISD::SPLAT_VECTOR:
+    return isGuaranteedNotToBeUndefOrPoison(Op.getOperand(0), PoisonOnly,
+                                            Depth + 1);
+
   case ISD::VECTOR_SHUFFLE: {
     APInt DemandedLHS, DemandedRHS;
     auto *SVN = cast<ShuffleVectorSDNode>(Op);
@@ -5236,12 +5242,8 @@ bool SelectionDAG::isGuaranteedNotToBeUndefOrPoison(SDValue Op,
 bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, bool PoisonOnly,
                                           bool ConsiderFlags,
                                           unsigned Depth) const {
-  // TODO: Assume we don't know anything for now.
   EVT VT = Op.getValueType();
-  if (VT.isScalableVector())
-    return true;
-
-  APInt DemandedElts = VT.isVector()
+  APInt DemandedElts = VT.isFixedLengthVector()
                            ? APInt::getAllOnes(VT.getVectorNumElements())
                            : APInt(1, 1);
   return canCreateUndefOrPoison(Op, DemandedElts, PoisonOnly, ConsiderFlags,
@@ -5251,11 +5253,6 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, bool PoisonOnly,
 bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
                                           bool PoisonOnly, bool ConsiderFlags,
                                           unsigned Depth) const {
-  // TODO: Assume we don't know anything for now.
-  EVT VT = Op.getValueType();
-  if (VT.isScalableVector())
-    return true;
-
   if (ConsiderFlags && Op->hasPoisonGeneratingFlags())
     return true;
 
@@ -5292,6 +5289,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
   case ISD::BITCAST:
   case ISD::BUILD_VECTOR:
   case ISD::BUILD_PAIR:
+  case ISD::SPLAT_VECTOR:
     return false;
 
   case ISD::SELECT_CC:
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 79e240d62a17d7..1b0012b65b80d4 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -944,13 +944,13 @@ void TargetPassConfig::addCodeGenPrepare() {
 void TargetPassConfig::addISelPrepare() {
   addPreISel();
 
-  if (getOptLevel() != CodeGenOptLevel::None)
-    addPass(createObjCARCContractPass());
-
   // Force codegen to run according to the callgraph.
   if (requiresCodeGenSCCOrder())
     addPass(new DummyCGSCCPass);
 
+  if (getOptLevel() != CodeGenOptLevel::None)
+    addPass(createObjCARCContractPass());
+
   addPass(createCallBrPass());
 
   // Add both the safe stack and the stack protection passes: each of them will
diff --git a/llvm/lib/CodeGen/TypePromotion.cpp b/llvm/lib/CodeGen/TypePromotion.cpp
index 0940759ddc421f..11aa3abe906853 100644
--- a/llvm/lib/CodeGen/TypePromotion.cpp
+++ b/llvm/lib/CodeGen/TypePromotion.cpp
@@ -834,11 +834,10 @@ bool TypePromotionImpl::TryToPromote(Value *V, unsigned PromotedWidth,
     // the tree has already been explored.
     // TODO: This could limit the transform, ie if we try to promote something
     // from an i8 and fail first, before trying an i16.
-    if (AllVisited.count(V))
+    if (!AllVisited.insert(V).second)
       return false;
 
     CurrentVisited.insert(V);
-    AllVisited.insert(V);
 
     // Calls can be both sources and sinks.
     if (isSink(V))
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
index 00c2823cee0af9..ec7af792efb067 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
@@ -113,11 +113,10 @@ static void dumpExpression(raw_ostream &OS, DIDumpOptions DumpOpts,
                            ArrayRef<uint8_t> Data, bool IsLittleEndian,
                            unsigned AddressSize, DWARFUnit *U) {
   DWARFDataExtractor Extractor(Data, IsLittleEndian, AddressSize);
-  // Note. We do not pass any format to DWARFExpression, even if the
-  // corresponding unit is known. For now, there is only one operation,
-  // DW_OP_call_ref, which depends on the format; it is rarely used, and
-  // is unexpected in location tables.
-  DWARFExpression(Extractor, AddressSize).print(OS, DumpOpts, U);
+  std::optional<dwarf::DwarfFormat> Format;
+  if (U)
+    Format = U->getFormat();
+  DWARFExpression(Extractor, AddressSize, Format).print(OS, DumpOpts, U);
 }
 
 bool DWARFLocationTable::dumpLocationList(
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp b/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp
index b90addbfba04af..2ae5ff3efc8c5f 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp
@@ -90,6 +90,8 @@ static std::vector<Desc> getOpDescriptions() {
   Descriptions[DW_OP_implicit_value] =
       Desc(Op::Dwarf4, Op::SizeLEB, Op::SizeBlock);
   Descriptions[DW_OP_stack_value] = Desc(Op::Dwarf4);
+  Descriptions[DW_OP_implicit_pointer] =
+      Desc(Op::Dwarf5, Op::SizeRefAddr, Op::SignedSizeLEB);
   Descriptions[DW_OP_addrx] = Desc(Op::Dwarf5, Op::SizeLEB);
   Descriptions[DW_OP_constx] = Desc(Op::Dwarf5, Op::SizeLEB);
   Descriptions[DW_OP_entry_value] = Desc(Op::Dwarf5, Op::SizeLEB);
diff --git a/llvm/lib/FuzzMutate/RandomIRBuilder.cpp b/llvm/lib/FuzzMutate/RandomIRBuilder.cpp
index fe4ad10a02d57d..b645888463b12a 100644
--- a/llvm/lib/FuzzMutate/RandomIRBuilder.cpp
+++ b/llvm/lib/FuzzMutate/RandomIRBuilder.cpp
@@ -67,7 +67,7 @@ AllocaInst *RandomIRBuilder::createStackMemory(Function *F, Type *Ty,
                                                Value *Init) {
   /// TODO: For all Allocas, maybe allocate an array.
   BasicBlock *EntryBB = &F->getEntryBlock();
-  DataLayout DL(F->getParent());
+  const DataLayout &DL = F->getDataLayout();
   AllocaInst *Alloca = new AllocaInst(Ty, DL.getAllocaAddrSpace(), "A",
                                       EntryBB->getFirstInsertionPt());
   if (Init)
@@ -423,7 +423,7 @@ Function *RandomIRBuilder::createFunctionDefinition(Module &M,
   // TODO: Some arguments and a return value would probably be more
   // interesting.
   LLVMContext &Context = M.getContext();
-  DataLayout DL(&M);
+  const DataLayout &DL = M.getDataLayout();
   BasicBlock *BB = BasicBlock::Create(Context, "BB", F);
   Type *RetTy = F->getReturnType();
   if (RetTy != Type::getVoidTy(Context)) {
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 30b79b6d6f60fd..0db82cdd6373c8 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -509,7 +509,7 @@ DICompositeType *DIBuilder::createClassType(
          "createClassType should be called with a valid Context");
 
   auto *R = DICompositeType::get(
-      VMContext, dwarf::DW_TAG_structure_type, Name, File, LineNumber,
+      VMContext, dwarf::DW_TAG_class_type, Name, File, LineNumber,
       getNonCompileUnitScope(Context), DerivedFrom, SizeInBits, AlignInBits,
       OffsetInBits, Flags, Elements, RunTimeLang, VTableHolder,
       cast_or_null<MDTuple>(TemplateParams), UniqueIdentifier);
diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp
index 2aa3b672f06842..ae0ff9c5ffe914 100644
--- a/llvm/lib/IR/DataLayout.cpp
+++ b/llvm/lib/IR/DataLayout.cpp
@@ -22,7 +22,6 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Casting.h"
@@ -118,6 +117,27 @@ unsigned StructLayout::getElementContainingOffset(uint64_t FixedOffset) const {
   return SI - MemberOffsets.begin();
 }
 
+namespace {
+
+class StructLayoutMap {
+  using LayoutInfoTy = DenseMap<StructType *, StructLayout *>;
+  LayoutInfoTy LayoutInfo;
+
+public:
+  ~StructLayoutMap() {
+    // Remove any layouts.
+    for (const auto &I : LayoutInfo) {
+      StructLayout *Value = I.second;
+      Value->~StructLayout();
+      free(Value);
+    }
+  }
+
+  StructLayout *&operator[](StructType *STy) { return LayoutInfo[STy]; }
+};
+
+} // end anonymous namespace
+
 //===----------------------------------------------------------------------===//
 // LayoutAlignElem, LayoutAlign support
 //===----------------------------------------------------------------------===//
@@ -192,36 +212,31 @@ static const std::pair<AlignTypeEnum, LayoutAlignElem> DefaultAlignments[] = {
     {VECTOR_ALIGN, {128, Align(16), Align(16)}}, // v16i8, v8i16, v4i32, ...
 };
 
-void DataLayout::reset(StringRef Desc) {
-  clear();
-
-  LayoutMap = nullptr;
+DataLayout::DataLayout(StringRef LayoutString) {
   BigEndian = false;
   AllocaAddrSpace = 0;
-  StackNaturalAlign.reset();
   ProgramAddrSpace = 0;
   DefaultGlobalsAddrSpace = 0;
-  FunctionPtrAlign.reset();
   TheFunctionPtrAlignType = FunctionPtrAlignType::Independent;
   ManglingMode = MM_None;
-  NonIntegralAddressSpaces.clear();
   StructAlignment = LayoutAlignElem::get(Align(1), Align(8), 0);
 
   // Default alignments
   for (const auto &[Kind, Layout] : DefaultAlignments) {
     if (Error Err = setAlignment(Kind, Layout.ABIAlign, Layout.PrefAlign,
                                  Layout.TypeBitWidth))
-      return report_fatal_error(std::move(Err));
+      report_fatal_error(std::move(Err));
   }
   if (Error Err = setPointerAlignmentInBits(0, Align(8), Align(8), 64, 64))
-    return report_fatal_error(std::move(Err));
+    report_fatal_error(std::move(Err));
 
-  if (Error Err = parseSpecifier(Desc))
-    return report_fatal_error(std::move(Err));
+  if (Error Err = parseSpecifier(LayoutString))
+    report_fatal_error(std::move(Err));
 }
 
 DataLayout &DataLayout::operator=(const DataLayout &Other) {
-  clear();
+  delete static_cast<StructLayoutMap *>(LayoutMap);
+  LayoutMap = nullptr;
   StringRepresentation = Other.StringRepresentation;
   BigEndian = Other.BigEndian;
   AllocaAddrSpace = Other.AllocaAddrSpace;
@@ -589,12 +604,6 @@ Error DataLayout::parseSpecifier(StringRef Desc) {
   return Error::success();
 }
 
-DataLayout::DataLayout(const Module *M) {
-  init(M);
-}
-
-void DataLayout::init(const Module *M) { *this = M->getDataLayout(); }
-
 static SmallVectorImpl<LayoutAlignElem>::const_iterator
 findAlignmentLowerBound(const SmallVectorImpl<LayoutAlignElem> &Alignments,
                         uint32_t BitWidth) {
@@ -700,42 +709,7 @@ Align DataLayout::getIntegerAlignment(uint32_t BitWidth,
   return abi_or_pref ? I->ABIAlign : I->PrefAlign;
 }
 
-namespace {
-
-class StructLayoutMap {
-  using LayoutInfoTy = DenseMap<StructType*, StructLayout*>;
-  LayoutInfoTy LayoutInfo;
-
-public:
-  ~StructLayoutMap() {
-    // Remove any layouts.
-    for (const auto &I : LayoutInfo) {
-      StructLayout *Value = I.second;
-      Value->~StructLayout();
-      free(Value);
-    }
-  }
-
-  StructLayout *&operator[](StructType *STy) {
-    return LayoutInfo[STy];
-  }
-};
-
-} // end anonymous namespace
-
-void DataLayout::clear() {
-  LegalIntWidths.clear();
-  IntAlignments.clear();
-  FloatAlignments.clear();
-  VectorAlignments.clear();
-  Pointers.clear();
-  delete static_cast<StructLayoutMap *>(LayoutMap);
-  LayoutMap = nullptr;
-}
-
-DataLayout::~DataLayout() {
-  clear();
-}
+DataLayout::~DataLayout() { delete static_cast<StructLayoutMap *>(LayoutMap); }
 
 const StructLayout *DataLayout::getStructLayout(StructType *Ty) const {
   if (!LayoutMap)
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 80b5408b61eda0..4738ec7639f57a 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -388,9 +388,7 @@ void Module::setModuleFlag(ModFlagBehavior Behavior, StringRef Key,
   setModuleFlag(Behavior, Key, ConstantInt::get(Int32Ty, Val));
 }
 
-void Module::setDataLayout(StringRef Desc) {
-  DL.reset(Desc);
-}
+void Module::setDataLayout(StringRef Desc) { DL = DataLayout(Desc); }
 
 void Module::setDataLayout(const DataLayout &Other) { DL = Other; }
 
diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp
index 6dadd9752646f2..f37e138edc36b1 100644
--- a/llvm/lib/MC/MCObjectFileInfo.cpp
+++ b/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -1024,7 +1024,7 @@ void MCObjectFileInfo::initMCObjectFileInfo(MCContext &MCCtx, bool PIC,
   DwarfAccelNamespaceSection = nullptr; // Used only by selected targets.
   DwarfAccelTypesSection = nullptr;     // Used only by selected targets.
 
-  Triple TheTriple = Ctx->getTargetTriple();
+  const Triple &TheTriple = Ctx->getTargetTriple();
   switch (Ctx->getObjectFileType()) {
   case MCContext::IsMachO:
     initMachOMCObjectFileInfo(TheTriple);
diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index ecc487a17cccae..e34a770b1b53e5 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -404,6 +404,22 @@ class CommandLineParser {
 
 static ManagedStatic<CommandLineParser> GlobalParser;
 
+template <typename T, T TrueVal, T FalseVal>
+static bool parseBool(Option &O, StringRef ArgName, StringRef Arg, T &Value) {
+  if (Arg == "" || Arg == "true" || Arg == "TRUE" || Arg == "True" ||
+      Arg == "1") {
+    Value = TrueVal;
+    return false;
+  }
+
+  if (Arg == "false" || Arg == "FALSE" || Arg == "False" || Arg == "0") {
+    Value = FalseVal;
+    return false;
+  }
+  return O.error("'" + Arg +
+                 "' is invalid value for boolean argument! Try 0 or 1");
+}
+
 void cl::AddLiteralOption(Option &O, StringRef Name) {
   GlobalParser->addLiteralOption(O, Name);
 }
@@ -1954,36 +1970,14 @@ void basic_parser_impl::printOptionName(const Option &O,
 //
 bool parser<bool>::parse(Option &O, StringRef ArgName, StringRef Arg,
                          bool &Value) {
-  if (Arg == "" || Arg == "true" || Arg == "TRUE" || Arg == "True" ||
-      Arg == "1") {
-    Value = true;
-    return false;
-  }
-
-  if (Arg == "false" || Arg == "FALSE" || Arg == "False" || Arg == "0") {
-    Value = false;
-    return false;
-  }
-  return O.error("'" + Arg +
-                 "' is invalid value for boolean argument! Try 0 or 1");
+  return parseBool<bool, true, false>(O, ArgName, Arg, Value);
 }
 
 // parser<boolOrDefault> implementation
 //
 bool parser<boolOrDefault>::parse(Option &O, StringRef ArgName, StringRef Arg,
                                   boolOrDefault &Value) {
-  if (Arg == "" || Arg == "true" || Arg == "TRUE" || Arg == "True" ||
-      Arg == "1") {
-    Value = BOU_TRUE;
-    return false;
-  }
-  if (Arg == "false" || Arg == "FALSE" || Arg == "False" || Arg == "0") {
-    Value = BOU_FALSE;
-    return false;
-  }
-
-  return O.error("'" + Arg +
-                 "' is invalid value for boolean argument! Try 0 or 1");
+  return parseBool<boolOrDefault, BOU_TRUE, BOU_FALSE>(O, ArgName, Arg, Value);
 }
 
 // parser<int> implementation
diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index 99d866a23a68ba..0554f0cf578831 100644
--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -32,18 +32,18 @@ using namespace llvm;
 namespace {
 // A list of supported preprocessing directives with their
 // internal token kinds and names.
-struct {
+struct PreprocessorDir {
   tgtok::TokKind Kind;
-  const char *Word;
-} PreprocessorDirs[] = {
-  { tgtok::Ifdef, "ifdef" },
-  { tgtok::Ifndef, "ifndef" },
-  { tgtok::Else, "else" },
-  { tgtok::Endif, "endif" },
-  { tgtok::Define, "define" }
+  StringRef Word;
 };
 } // end anonymous namespace
 
+constexpr PreprocessorDir PreprocessorDirs[] = {{tgtok::Ifdef, "ifdef"},
+                                                {tgtok::Ifndef, "ifndef"},
+                                                {tgtok::Else, "else"},
+                                                {tgtok::Endif, "endif"},
+                                                {tgtok::Define, "define"}};
+
 TGLexer::TGLexer(SourceMgr &SM, ArrayRef<std::string> Macros) : SrcMgr(SM) {
   CurBuffer = SrcMgr.getMainFileID();
   CurBuf = SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer();
@@ -641,54 +641,43 @@ bool TGLexer::prepExitInclude(bool IncludeStackMustBeEmpty) {
 }
 
 tgtok::TokKind TGLexer::prepIsDirective() const {
-  for (const auto &PD : PreprocessorDirs) {
-    int NextChar = *CurPtr;
-    bool Match = true;
-    unsigned I = 0;
-    for (; I < strlen(PD.Word); ++I) {
-      if (NextChar != PD.Word[I]) {
-        Match = false;
-        break;
-      }
-
-      NextChar = peekNextChar(I + 1);
-    }
+  for (const auto [Kind, Word] : PreprocessorDirs) {
+    if (StringRef(CurPtr, Word.size()) != Word)
+      continue;
+    int NextChar = peekNextChar(Word.size());
 
-    // Check for whitespace after the directive.  If there is no whitespace,
+    // Check for whitespace after the directive. If there is no whitespace,
     // then we do not recognize it as a preprocessing directive.
-    if (Match) {
-      tgtok::TokKind Kind = PD.Kind;
-
-      // New line and EOF may follow only #else/#endif.  It will be reported
-      // as an error for #ifdef/#define after the call to prepLexMacroName().
-      if (NextChar == ' ' || NextChar == '\t' || NextChar == EOF ||
-          NextChar == '\n' ||
-          // It looks like TableGen does not support '\r' as the actual
-          // carriage return, e.g. getNextChar() treats a single '\r'
-          // as '\n'.  So we do the same here.
-          NextChar == '\r')
-        return Kind;
 
-      // Allow comments after some directives, e.g.:
-      //     #else// OR #else/**/
-      //     #endif// OR #endif/**/
-      //
-      // Note that we do allow comments after #ifdef/#define here, e.g.
-      //     #ifdef/**/ AND #ifdef//
-      //     #define/**/ AND #define//
-      //
-      // These cases will be reported as incorrect after calling
-      // prepLexMacroName().  We could have supported C-style comments
-      // after #ifdef/#define, but this would complicate the code
-      // for little benefit.
-      if (NextChar == '/') {
-        NextChar = peekNextChar(I + 1);
+    // New line and EOF may follow only #else/#endif. It will be reported
+    // as an error for #ifdef/#define after the call to prepLexMacroName().
+    if (NextChar == ' ' || NextChar == '\t' || NextChar == EOF ||
+        NextChar == '\n' ||
+        // It looks like TableGen does not support '\r' as the actual
+        // carriage return, e.g. getNextChar() treats a single '\r'
+        // as '\n'. So we do the same here.
+        NextChar == '\r')
+      return Kind;
 
-        if (NextChar == '*' || NextChar == '/')
-          return Kind;
+    // Allow comments after some directives, e.g.:
+    //     #else// OR #else/**/
+    //     #endif// OR #endif/**/
+    //
+    // Note that we do allow comments after #ifdef/#define here, e.g.
+    //     #ifdef/**/ AND #ifdef//
+    //     #define/**/ AND #define//
+    //
+    // These cases will be reported as incorrect after calling
+    // prepLexMacroName(). We could have supported C-style comments
+    // after #ifdef/#define, but this would complicate the code
+    // for little benefit.
+    if (NextChar == '/') {
+      NextChar = peekNextChar(Word.size() + 1);
+
+      if (NextChar == '*' || NextChar == '/')
+        return Kind;
 
-        // Pretend that we do not recognize the directive.
-      }
+      // Pretend that we do not recognize the directive.
     }
   }
 
@@ -698,10 +687,10 @@ tgtok::TokKind TGLexer::prepIsDirective() const {
 bool TGLexer::prepEatPreprocessorDirective(tgtok::TokKind Kind) {
   TokStart = CurPtr;
 
-  for (const auto &PD : PreprocessorDirs)
-    if (PD.Kind == Kind) {
+  for (const auto [PKind, PWord] : PreprocessorDirs)
+    if (PKind == Kind) {
       // Advance CurPtr to the end of the preprocessing word.
-      CurPtr += strlen(PD.Word);
+      CurPtr += PWord.size();
       return true;
     }
 
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 0391d518324315..b8f9b58a216446 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -1310,6 +1310,13 @@ void AArch64AsmPrinter::emitGlobalAlias(const Module &M,
       StringRef ExpStr = cast<MDString>(Node->getOperand(0))->getString();
       MCSymbol *ExpSym = MMI->getContext().getOrCreateSymbol(ExpStr);
       MCSymbol *Sym = MMI->getContext().getOrCreateSymbol(GA.getName());
+
+      OutStreamer->beginCOFFSymbolDef(ExpSym);
+      OutStreamer->emitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_EXTERNAL);
+      OutStreamer->emitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
+                                      << COFF::SCT_COMPLEX_TYPE_SHIFT);
+      OutStreamer->endCOFFSymbolDef();
+
       OutStreamer->beginCOFFSymbolDef(Sym);
       OutStreamer->emitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_EXTERNAL);
       OutStreamer->emitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index bf0eb1461e55e9..429affed63497d 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1394,6 +1394,18 @@ bool requiresGetVGCall(MachineFunction &MF) {
          !MF.getSubtarget<AArch64Subtarget>().hasSVE();
 }
 
+static bool requiresSaveVG(MachineFunction &MF) {
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  // Saving VG is only required when the function has streaming-mode changes.
+  // On Darwin it is additionally skipped for non-SVE functions, even with SME.
+  if (!AFI->hasStreamingModeChanges())
+    return false;
+  auto &ST = MF.getSubtarget<AArch64Subtarget>();
+  if (ST.isTargetDarwin())
+    return ST.hasSVE();
+  return true;
+}
+
 bool isVGInstruction(MachineBasicBlock::iterator MBBI) {
   unsigned Opc = MBBI->getOpcode();
   if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI ||
@@ -1430,8 +1442,7 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
   // functions, we need to do this for both the streaming and non-streaming
   // vector length. Move past these instructions if necessary.
   MachineFunction &MF = *MBB.getParent();
-  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
-  if (AFI->hasStreamingModeChanges())
+  if (requiresSaveVG(MF))
     while (isVGInstruction(MBBI))
       ++MBBI;
 
@@ -1938,7 +1949,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
          !IsSVECalleeSave(MBBI)) {
     // Move past instructions generated to calculate VG
-    if (AFI->hasStreamingModeChanges())
+    if (requiresSaveVG(MF))
       while (isVGInstruction(MBBI))
         ++MBBI;
 
@@ -3754,7 +3765,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
   // non-streaming VG value.
   const Function &F = MF.getFunction();
   SMEAttrs Attrs(F);
-  if (AFI->hasStreamingModeChanges()) {
+  if (requiresSaveVG(MF)) {
     if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
       CSStackSize += 16;
     else
@@ -3907,7 +3918,7 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
   }
 
   // Insert VG into the list of CSRs, immediately before LR if saved.
-  if (AFI->hasStreamingModeChanges()) {
+  if (requiresSaveVG(MF)) {
     std::vector<CalleeSavedInfo> VGSaves;
     SMEAttrs Attrs(MF.getFunction());
 
@@ -4636,10 +4647,9 @@ MachineBasicBlock::iterator emitVGSaveRestore(MachineBasicBlock::iterator II,
 
 void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
     MachineFunction &MF, RegScavenger *RS = nullptr) const {
-  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   for (auto &BB : MF)
     for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) {
-      if (AFI->hasStreamingModeChanges())
+      if (requiresSaveVG(MF))
         II = emitVGSaveRestore(II, this);
       if (StackTaggingMergeSetTag)
         II = tryMergeAdjacentSTG(II, this, RS);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index efc1703221d21e..314e7134dcd01a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1775,6 +1775,18 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                     MVT::v2f32, MVT::v4f32, MVT::v2f64})
       setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
 
+    // We can lower types that have <vscale x {2|4}> elements to compact.
+    for (auto VT :
+         {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
+          MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
+      setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
+
+    // If we have SVE, we can use SVE logic for legal (or smaller than legal)
+    // NEON vectors in the lowest bits of the SVE register.
+    for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
+                    MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
+      setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom);
+
     // Histcnt is SVE2 only
     if (Subtarget->hasSVE2()) {
       setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::Other,
@@ -6619,6 +6631,104 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
   return DAG.getMergeValues({Ext, Chain}, DL);
 }
 
+SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue Vec = Op.getOperand(0);
+  SDValue Mask = Op.getOperand(1);
+  SDValue Passthru = Op.getOperand(2);
+  EVT VecVT = Vec.getValueType();
+  EVT MaskVT = Mask.getValueType();
+  EVT ElmtVT = VecVT.getVectorElementType();
+  const bool IsFixedLength = VecVT.isFixedLengthVector();
+  const bool HasPassthru = !Passthru.isUndef();
+  unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
+  EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts);
+
+  assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
+
+  if (!Subtarget->isSVEAvailable())
+    return SDValue();
+
+  if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
+    return SDValue();
+
+  // Only <vscale x {4|2} x {i32|i64}> is supported by compact.
+  if (MinElmts != 2 && MinElmts != 4)
+    return SDValue();
+
+  // We can use the SVE register containing the NEON vector in its lowest bits.
+  if (IsFixedLength) {
+    EVT ScalableVecVT =
+        MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts);
+    EVT ScalableMaskVT = MVT::getScalableVectorVT(
+        MaskVT.getVectorElementType().getSimpleVT(), MinElmts);
+
+    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
+                      DAG.getUNDEF(ScalableVecVT), Vec,
+                      DAG.getConstant(0, DL, MVT::i64));
+    Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
+                       DAG.getUNDEF(ScalableMaskVT), Mask,
+                       DAG.getConstant(0, DL, MVT::i64));
+    Mask = DAG.getNode(ISD::TRUNCATE, DL,
+                       ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
+    Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
+                           DAG.getUNDEF(ScalableVecVT), Passthru,
+                           DAG.getConstant(0, DL, MVT::i64));
+
+    VecVT = Vec.getValueType();
+    MaskVT = Mask.getValueType();
+  }
+
+  // Get the legal container type for the compact instruction.
+  EVT ContainerVT = getSVEContainerType(VecVT);
+  EVT CastVT = VecVT.changeVectorElementTypeToInteger();
+
+  // Convert to i32 or i64 for smaller types, as these are the only supported
+  // sizes for compact.
+  if (ContainerVT != VecVT) {
+    Vec = DAG.getBitcast(CastVT, Vec);
+    Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
+  }
+
+  SDValue Compressed = DAG.getNode(
+      ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(),
+      DAG.getConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask, Vec);
+
+  // compact fills with 0s, so if our passthru is all 0s, do nothing here.
+  if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
+    SDValue Offset = DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
+        DAG.getConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, Mask);
+
+    SDValue IndexMask = DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, DL, MaskVT,
+        DAG.getConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64),
+        DAG.getConstant(0, DL, MVT::i64), Offset);
+
+    Compressed =
+        DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru);
+  }
+
+  // Extracting from a legal SVE type before truncating produces better code.
+  if (IsFixedLength) {
+    Compressed = DAG.getNode(
+        ISD::EXTRACT_SUBVECTOR, DL,
+        FixedVecVT.changeVectorElementType(ContainerVT.getVectorElementType()),
+        Compressed, DAG.getConstant(0, DL, MVT::i64));
+    CastVT = FixedVecVT.changeVectorElementTypeToInteger();
+    VecVT = FixedVecVT;
+  }
+
+  // If we changed the element type before, we need to convert it back.
+  if (ContainerVT != VecVT) {
+    Compressed = DAG.getNode(ISD::TRUNCATE, DL, CastVT, Compressed);
+    Compressed = DAG.getBitcast(VecVT, Compressed);
+  }
+
+  return Compressed;
+}
+
 // Generate SUBS and CSEL for integer abs.
 SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
   MVT VT = Op.getSimpleValueType();
@@ -6999,6 +7109,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerDYNAMIC_STACKALLOC(Op, DAG);
   case ISD::VSCALE:
     return LowerVSCALE(Op, DAG);
+  case ISD::VECTOR_COMPRESS:
+    return LowerVECTOR_COMPRESS(Op, DAG);
   case ISD::ANY_EXTEND:
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
@@ -8762,10 +8874,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   SDValue InGlue;
   if (RequiresSMChange) {
-
-    Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL,
-                        DAG.getVTList(MVT::Other, MVT::Glue), Chain);
-    InGlue = Chain.getValue(1);
+    if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
+      Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL,
+                          DAG.getVTList(MVT::Other, MVT::Glue), Chain);
+      InGlue = Chain.getValue(1);
+    }
 
     SDValue NewChain = changeStreamingMode(
         DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
@@ -8944,11 +9057,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     Result = changeStreamingMode(
         DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
         getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
-    InGlue = Result.getValue(1);
 
-    Result =
-        DAG.getNode(AArch64ISD::VG_RESTORE, DL,
-                    DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue});
+    if (!Subtarget->isTargetDarwin() || Subtarget->hasSVE()) {
+      InGlue = Result.getValue(1);
+      Result =
+          DAG.getNode(AArch64ISD::VG_RESTORE, DL,
+                      DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue});
+    }
   }
 
   if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
@@ -26560,6 +26675,10 @@ void AArch64TargetLowering::ReplaceNodeResults(
   case ISD::VECREDUCE_UMIN:
     Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
     return;
+  case ISD::VECTOR_COMPRESS:
+    if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG))
+      Results.push_back(Res);
+    return;
   case ISD::ADD:
   case ISD::FADD:
     ReplaceAddWithADDP(N, Results, DAG, Subtarget);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index ace682fb89379d..2fa9c49019326d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -1075,6 +1075,8 @@ class AArch64TargetLowering : public TargetLowering {
 
   SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue LowerVECTOR_COMPRESS(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 1b914b52ad2f89..bb05dc85d29a1f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -10147,10 +10147,20 @@ let Predicates = [HasFP8] in {
   defm FSCALE : SIMDThreeSameVectorFP<0b1, 0b1, 0b111, "fscale", null_frag>;
 } // End let Predicates = [HasFP8]
 
-let Predicates = [HasFAMINMAX] in {
+let Predicates = [HasNEON, HasFAMINMAX] in {
  defm FAMAX : SIMDThreeSameVectorFP<0b0, 0b1, 0b011, "famax", null_frag>;
  defm FAMIN : SIMDThreeSameVectorFP<0b1, 0b1, 0b011, "famin", null_frag>;
-} // End let Predicates = [HasFAMAXMIN]
+
+ foreach Ty = [v4f16, v8f16, v2f32, v4f32, v2f64] in {
+  // Replace min(abs(a), abs(b)) with famin(a, b)
+  def : Pat<(Ty (fminimum (fabs Ty:$Rn), (fabs Ty:$Rm))),
+            (!cast<Instruction>("FAMIN"#Ty) Ty:$Rn, Ty:$Rm)>;
+
+  // Replace max(abs(a), abs(b)) with famax(a, b)
+  def : Pat<(Ty (fmaximum (fabs Ty:$Rn), (fabs Ty:$Rm))),
+            (!cast<Instruction>("FAMAX"#Ty) Ty:$Rn, Ty:$Rm)>;
+ }
+} // End let Predicates = [HasNEON, HasFAMINMAX]
 
 let Predicates = [HasFP8FMA] in {
  defm FMLALBlane : SIMDThreeSameVectorMLAIndex<0b0, "fmlalb">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 3e1d1283dd485e..25e36dc4b3691f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -1371,12 +1371,12 @@ bool AMDGPUCallLowering::lowerChainCall(MachineIRBuilder &MIRBuilder,
   // The function that we're calling cannot be vararg (only the intrinsic is).
   Info.IsVarArg = false;
 
-  assert(std::all_of(SGPRArgs.Flags.begin(), SGPRArgs.Flags.end(),
-                     [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
-         "SGPR arguments should be marked inreg");
-  assert(std::none_of(VGPRArgs.Flags.begin(), VGPRArgs.Flags.end(),
-                      [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
-         "VGPR arguments should not be marked inreg");
+  assert(
+      all_of(SGPRArgs.Flags, [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
+      "SGPR arguments should be marked inreg");
+  assert(
+      none_of(VGPRArgs.Flags, [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
+      "VGPR arguments should not be marked inreg");
 
   SmallVector<ArgInfo, 8> OutArgs;
   splitToValueTypes(SGPRArgs, OutArgs, DL, Info.CallConv);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 1ad2302a4f20c6..4e913d1b32e1f1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -955,10 +955,9 @@ class MFMAExpInterleaveOpt final : public IGLPStrategy {
           return false;
       }
 
-      auto Reaches = (std::any_of(
-          Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *TargetSU) {
-            return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
-          }));
+      auto Reaches = any_of(*Cache, [&SU, &DAG](SUnit *TargetSU) {
+        return DAG->IsReachable(TargetSU, const_cast<SUnit *>(SU));
+      });
 
       return Reaches;
     }
@@ -1477,10 +1476,9 @@ bool MFMAExpInterleaveOpt::analyzeDAG(const SIInstrInfo *TII) {
   for (auto &MFMAPipeSU : MFMAPipeSUs) {
     if (is_contained(MFMAChainSeeds, MFMAPipeSU))
       continue;
-    if (!std::any_of(MFMAPipeSU->Preds.begin(), MFMAPipeSU->Preds.end(),
-                     [&TII](SDep &Succ) {
-                       return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
-                     })) {
+    if (none_of(MFMAPipeSU->Preds, [&TII](SDep &Succ) {
+          return TII->isMFMAorWMMA(*Succ.getSUnit()->getInstr());
+        })) {
       MFMAChainSeeds.push_back(MFMAPipeSU);
       ++MFMAChains;
     }
@@ -1939,14 +1937,10 @@ class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
         return true;
 
       // Does the previous VALU have this DS_Write as a successor
-      return (std::any_of(OtherGroup->Collection.begin(),
-                          OtherGroup->Collection.end(), [&SU](SUnit *Elt) {
-                            return std::any_of(Elt->Succs.begin(),
-                                               Elt->Succs.end(),
-                                               [&SU](SDep &Succ) {
-                                                 return Succ.getSUnit() == SU;
-                                               });
-                          }));
+      return any_of(OtherGroup->Collection, [&SU](SUnit *Elt) {
+        return any_of(Elt->Succs,
+                      [&SU](SDep &Succ) { return Succ.getSUnit() == SU; });
+      });
     }
     IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,
                       bool NeedsCache = false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index f35bb204cbbdb7..05ed1b322c0d1b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -161,18 +161,34 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
 
         // TODO: Skip masking high bits if def is known boolean.
 
-        bool IsSGPR = TRI.isSGPRClass(SrcRC);
-        unsigned AndOpc =
-            IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
-        auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
-            .addImm(1)
-            .addReg(SrcReg);
-        if (IsSGPR)
-          And.setOperandDead(3); // Dead scc
-
-        BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
-            .addImm(0)
-            .addReg(MaskedReg);
+        if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
+          assert(Subtarget->useRealTrue16Insts());
+          const int64_t NoMods = 0;
+          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
+              .addImm(NoMods)
+              .addImm(1)
+              .addImm(NoMods)
+              .addReg(SrcReg)
+              .addImm(NoMods);
+          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
+              .addImm(NoMods)
+              .addImm(0)
+              .addImm(NoMods)
+              .addReg(MaskedReg)
+              .addImm(NoMods);
+        } else {
+          bool IsSGPR = TRI.isSGPRClass(SrcRC);
+          unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
+          auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
+                         .addImm(1)
+                         .addReg(SrcReg);
+          if (IsSGPR)
+            And.setOperandDead(3); // Dead scc
+
+          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
+              .addImm(0)
+              .addReg(MaskedReg);
+        }
       }
 
       if (!MRI->getRegClassOrNull(SrcReg))
@@ -2206,6 +2222,16 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
     return false;
   }
 
+  if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
+    assert(STI.useRealTrue16Insts());
+    const DebugLoc &DL = I.getDebugLoc();
+    MachineBasicBlock *MBB = I.getParent();
+    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
+        .addReg(SrcReg, 0, AMDGPU::lo16);
+    I.eraseFromParent();
+    return true;
+  }
+
   if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
     MachineBasicBlock *MBB = I.getParent();
     const DebugLoc &DL = I.getDebugLoc();
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 217279211531b4..d6958d9055fade 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -1647,11 +1647,10 @@ void GCNScheduleDAGMILive::updateRegionBoundaries(
 }
 
 static bool hasIGLPInstrs(ScheduleDAGInstrs *DAG) {
-  return std::any_of(
-      DAG->begin(), DAG->end(), [](MachineBasicBlock::iterator MI) {
-        unsigned Opc = MI->getOpcode();
-        return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;
-      });
+  return any_of(*DAG, [](MachineBasicBlock::iterator MI) {
+    unsigned Opc = MI->getOpcode();
+    return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;
+  });
 }
 
 GCNPostScheduleDAGMILive::GCNPostScheduleDAGMILive(
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 32647d0b6563df..3be235378a07ec 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16693,6 +16693,9 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
   LLVMContext &Ctx = Builder.getContext();
 
+  // If the return value isn't used, do not introduce a false use in the phi.
+  bool ReturnValueIsUsed = !AI->use_empty();
+
   BasicBlock *BB = Builder.GetInsertBlock();
   Function *F = BB->getParent();
   BasicBlock *ExitBB =
@@ -16756,14 +16759,18 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
   Builder.CreateBr(PhiBB);
 
   Builder.SetInsertPoint(PhiBB);
-  PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
-  Loaded->addIncoming(LoadedShared, SharedBB);
-  Loaded->addIncoming(LoadedPrivate, PrivateBB);
-  Loaded->addIncoming(LoadedGlobal, GlobalBB);
+
+  if (ReturnValueIsUsed) {
+    PHINode *Loaded = Builder.CreatePHI(ValTy, 3);
+    Loaded->addIncoming(LoadedShared, SharedBB);
+    Loaded->addIncoming(LoadedPrivate, PrivateBB);
+    Loaded->addIncoming(LoadedGlobal, GlobalBB);
+    Loaded->takeName(AI);
+    AI->replaceAllUsesWith(Loaded);
+  }
+
   Builder.CreateBr(ExitBB);
 
-  Loaded->takeName(AI);
-  AI->replaceAllUsesWith(Loaded);
   AI->eraseFromParent();
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 12ebcf880d03a2..280054a6a1c16b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2077,6 +2077,8 @@ def : GCNPat <
 >;
 
 foreach fp16vt = [f16, bf16] in {
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let SubtargetPredicate = p in {
 def : GCNPat <
   (fabs (fp16vt VGPR_32:$src)),
   (V_AND_B32_e64 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src)
@@ -2091,6 +2093,24 @@ def : GCNPat <
   (fneg (fabs (fp16vt VGPR_32:$src))),
   (V_OR_B32_e64 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit
 >;
+}
+
+let SubtargetPredicate = UseRealTrue16Insts in {
+def : GCNPat <
+  (fabs (fp16vt VGPR_16:$src)),
+  (V_AND_B16_t16_e64 (i32 0), (i16 0x7fff), (i32 0), VGPR_16:$src)
+>;
+
+def : GCNPat <
+  (fneg (fp16vt VGPR_16:$src)),
+  (V_XOR_B16_t16_e64 (i32 0), (i16 0x8000), (i32 0), VGPR_16:$src)
+>;
+
+def : GCNPat <
+  (fneg (fabs (fp16vt VGPR_16:$src))),
+  (V_OR_B16_t16_e64 (i32 0), (i16 0x8000), (i32 0), VGPR_16:$src) // Set sign bit
+>;
+} // End SubtargetPredicate = UseRealTrue16Insts
 } // End foreach fp16vt = ...
 
 def : GCNPat <
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 5d38cafd73dd95..6e945bf368f4eb 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -152,6 +152,10 @@ bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const {
       if (AMDGPU::VGPR_32RegClass.contains(Reg) &&
           !AMDGPU::VGPR_32_Lo128RegClass.contains(Reg))
         return false;
+
+      if (AMDGPU::VGPR_16RegClass.contains(Reg) &&
+          !AMDGPU::VGPR_16_Lo128RegClass.contains(Reg))
+        return false;
     }
   }
   return true;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 34d12aa5e07835..03e4cb9fcf49b7 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1397,7 +1397,8 @@ def : GCNPat <
 
 } // End OtherPredicates = [isGFX8Plus]
 
-let OtherPredicates = [isGFX8Plus] in {
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let OtherPredicates = [isGFX8Plus, p] in {
 def : GCNPat<
   (i32 (anyext i16:$src)),
   (COPY $src)
@@ -1420,7 +1421,43 @@ def : GCNPat <
   (EXTRACT_SUBREG $src, sub0)
 >;
 
-} // End OtherPredicates = [isGFX8Plus]
+} // End OtherPredicates = [isGFX8Plus, p]
+
+let OtherPredicates = [UseFakeTrue16Insts] in {
+def : GCNPat<
+  (i32 (DivergentUnaryFrag<anyext> i16:$src)),
+  (COPY $src)
+>;
+} // End OtherPredicates = [UseFakeTrue16Insts]
+
+
+let OtherPredicates = [UseRealTrue16Insts] in {
+def : GCNPat<
+  (i32 (UniformUnaryFrag<anyext> (i16 SReg_32:$src))),
+  (COPY $src)
+>;
+
+def : GCNPat<
+  (i32 (DivergentUnaryFrag<anyext> i16:$src)),
+  (REG_SEQUENCE VGPR_32, $src, lo16, (i16 (IMPLICIT_DEF)), hi16)
+>;
+
+def : GCNPat<
+  (i64 (anyext i16:$src)),
+  (REG_SEQUENCE VReg_64, $src, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
+>;
+
+def : GCNPat<
+  (i16 (trunc i32:$src)),
+  (EXTRACT_SUBREG $src, lo16)
+>;
+
+def : GCNPat <
+  (i16 (trunc i64:$src)),
+  (EXTRACT_SUBREG $src, lo16)
+>;
+
+} // End OtherPredicates = [UseRealTrue16Insts]
 
 //===----------------------------------------------------------------------===//
 // GFX9
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index d17b4f24081312..fccaa27f361381 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -922,18 +922,25 @@ def LDEXP_F16_VOPProfile : VOPProfile <[f16, f16, f16, untyped]> {
   let HasSrc1FloatMods = 0;
   let Src1ModSDWA = Int16SDWAInputMods;
 }
-def LDEXP_F16_VOPProfile_True16 : VOPProfile_Fake16<VOP_F16_F16_F16> {
+def LDEXP_F16_VOPProfile_True16 : VOPProfile_True16<VOP_F16_F16_F16> {
+  let Src1RC32 = RegisterOperand<VGPR_16_Lo128>;
+  let Src1DPP = RegisterOperand<VGPR_16_Lo128>;
+  let Src1ModDPP = IntT16VRegInputMods<0/*IsFake16*/>;
+}
+def LDEXP_F16_VOPProfile_Fake16 : VOPProfile_Fake16<VOP_F16_F16_F16> {
   let Src1RC32 = RegisterOperand<VGPR_32_Lo128>;
   let Src1DPP = RegisterOperand<VGPR_32_Lo128>;
-  let Src1ModDPP = IntT16VRegInputMods</* IsFake16= */ 1>;
+  let Src1ModDPP = IntT16VRegInputMods<1/*IsFake16*/>;
 }
 
 let isReMaterializable = 1 in {
 let FPDPRounding = 1 in {
   let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in
     defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", LDEXP_F16_VOPProfile>;
-  let SubtargetPredicate = HasTrue16BitInsts in
+  let SubtargetPredicate = UseRealTrue16Insts in
     defm V_LDEXP_F16_t16 : VOP2Inst <"v_ldexp_f16_t16", LDEXP_F16_VOPProfile_True16>;
+  let SubtargetPredicate = UseFakeTrue16Insts in
+    defm V_LDEXP_F16_fake16 : VOP2Inst <"v_ldexp_f16_fake16", LDEXP_F16_VOPProfile_Fake16, null_frag, "v_ldexp_f16_fake16">;
 } // End FPDPRounding = 1
 // FIXME VOP3 Only instructions. NFC using VOPProfile_True16 for these until a planned change to use a new register class for VOP3 encoded True16 instuctions
 defm V_LSHLREV_B16 : VOP2Inst_e64_t16 <"v_lshlrev_b16", VOP_I16_I16_I16, clshl_rev_16>;
@@ -968,14 +975,30 @@ class LDEXP_F16_Pat <SDPatternOperator op, VOP_Pseudo inst, VOPProfile P = inst.
 let OtherPredicates = [NotHasTrue16BitInsts] in
 def : LDEXP_F16_Pat<any_fldexp, V_LDEXP_F16_e64>;
 
-let OtherPredicates = [HasTrue16BitInsts] in
-def : LDEXP_F16_Pat<any_fldexp, V_LDEXP_F16_t16_e64>;
+class LDEXP_F16_t16_Pat <SDPatternOperator op, VOP_Pseudo inst, VOPProfile P = inst.Pfl> : GCNPat <
+  (P.DstVT (op (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+               (i16 (VOP3Mods0 P.Src1VT:$src1, i32:$src1_modifiers)))),
+  (inst $src0_modifiers, $src0,
+        $src1_modifiers, $src1,
+        $clamp, /* clamp */
+        $omod, /* omod */
+        0) /* op_sel */
+>;
+
+let OtherPredicates = [UseRealTrue16Insts] in
+def : LDEXP_F16_t16_Pat<any_fldexp, V_LDEXP_F16_t16_e64>;
+
+let OtherPredicates = [UseFakeTrue16Insts] in
+def : LDEXP_F16_Pat<any_fldexp, V_LDEXP_F16_fake16_e64>;
 
 let SubtargetPredicate = isGFX11Plus in {
   let isCommutable = 1 in {
-    defm V_AND_B16_t16 : VOP2Inst_e64 <"v_and_b16_t16", VOPProfile_Fake16<VOP_I16_I16_I16>, and>;
-    defm V_OR_B16_t16  : VOP2Inst_e64 <"v_or_b16_t16", VOPProfile_Fake16<VOP_I16_I16_I16>, or>;
-    defm V_XOR_B16_t16 : VOP2Inst_e64 <"v_xor_b16_t16", VOPProfile_Fake16<VOP_I16_I16_I16>, xor>;
+    defm V_AND_B16_t16 : VOP2Inst_e64 <"v_and_b16_t16", VOPProfile_True16<VOP_I16_I16_I16>, and>;
+    defm V_AND_B16_fake16 : VOP2Inst_e64 <"v_and_b16_fake16", VOPProfile_Fake16<VOP_I16_I16_I16>, and>;
+    defm V_OR_B16_t16  : VOP2Inst_e64 <"v_or_b16_t16", VOPProfile_True16<VOP_I16_I16_I16>, or>;
+    defm V_OR_B16_fake16  : VOP2Inst_e64 <"v_or_b16_fake16", VOPProfile_Fake16<VOP_I16_I16_I16>, or>;
+    defm V_XOR_B16_t16 : VOP2Inst_e64 <"v_xor_b16_t16", VOPProfile_True16<VOP_I16_I16_I16>, xor>;
+    defm V_XOR_B16_fake16 : VOP2Inst_e64 <"v_xor_b16_fake16", VOPProfile_Fake16<VOP_I16_I16_I16>, xor>;
   } // End isCommutable = 1
 } // End SubtargetPredicate = isGFX11Plus
 
@@ -1714,6 +1737,7 @@ defm V_MUL_F16_t16         : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">;
 defm V_MUL_F16_fake16      : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">;
 defm V_FMAC_F16_t16        : VOP2_Real_FULL_t16_gfx11_gfx12<0x036, "v_fmac_f16">;
 defm V_LDEXP_F16_t16       : VOP2_Real_FULL_t16_gfx11_gfx12<0x03b, "v_ldexp_f16">;
+defm V_LDEXP_F16_fake16    : VOP2_Real_FULL_t16_gfx11_gfx12<0x03b, "v_ldexp_f16">;
 defm V_MAX_F16_t16         : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">;
 defm V_MAX_F16_fake16      : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">;
 defm V_MIN_F16_t16         : VOP2_Real_FULL_t16_gfx11<0x03a, "v_min_f16">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index efa8e9c74d4495..6748eff9376b0d 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1227,8 +1227,11 @@ let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in)
   defm V_WRITELANE_B32     : VOP3_Real_No_Suffix_gfx11_gfx12<0x361>; // Pseudo in VOP2
 } // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in)
 defm V_AND_B16_t16         : VOP3Only_Realtriple_t16_gfx11_gfx12<0x362, "v_and_b16">;
+defm V_AND_B16_fake16      : VOP3Only_Realtriple_t16_gfx11_gfx12<0x362, "v_and_b16">;
 defm V_OR_B16_t16          : VOP3Only_Realtriple_t16_gfx11_gfx12<0x363, "v_or_b16">;
+defm V_OR_B16_fake16       : VOP3Only_Realtriple_t16_gfx11_gfx12<0x363, "v_or_b16">;
 defm V_XOR_B16_t16         : VOP3Only_Realtriple_t16_gfx11_gfx12<0x364, "v_xor_b16">;
+defm V_XOR_B16_fake16      : VOP3Only_Realtriple_t16_gfx11_gfx12<0x364, "v_xor_b16">;
 
 //===----------------------------------------------------------------------===//
 // GFX10.
diff --git a/llvm/lib/Target/Hexagon/HexagonMask.cpp b/llvm/lib/Target/Hexagon/HexagonMask.cpp
index ac9abfa6c12331..d31edc3d299ffe 100644
--- a/llvm/lib/Target/Hexagon/HexagonMask.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonMask.cpp
@@ -90,6 +90,9 @@ bool HexagonMask::runOnMachineFunction(MachineFunction &MF) {
 
   if (!F.hasFnAttribute(Attribute::OptimizeForSize))
     return false;
+  // The mask instruction is only available from v66 onwards.
+  if (!HST.hasV66Ops())
+    return false;
   // The mask instruction available in v66 can be used to generate values in
   // registers using 2 immediates Eg. to form 0x07fffffc in R0, you would write
   // "R0 = mask(#25,#2)" Since it is a single-word instruction, it takes less
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 773322937498dc..1205ad4c6b008f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -328,7 +328,7 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
                                     const NVPTXTargetLowering *TLI) {
   Function *Func = Arg->getParent();
   Type *StructType = Arg->getParamByValType();
-  const DataLayout DL(Func->getParent());
+  const DataLayout &DL = Func->getDataLayout();
 
   uint64_t NewArgAlign =
       TLI->getFunctionParamOptimizedAlign(Func, StructType, DL).value();
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 3beaa5198259d0..6fde09d89e4839 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21024,6 +21024,11 @@ EVT RISCVTargetLowering::getOptimalMemOpType(const MemOp &Op,
     // which ends up using scalar sequences.
     return MVT::Other;
 
+  // If the minimum VLEN is less than or equal to RISCV::RVVBitsPerBlock, don't
+  // use fixed vectors.
+  if (MinVLenInBytes <= RISCV::RVVBitsPerBlock / 8)
+    return MVT::Other;
+
   // Prefer i8 for non-zero memset as it allows us to avoid materializing
   // a large scalar constant and instead use vmv.v.x/i to do the
   // broadcast.  For everything else, prefer ELenVT to minimize VL and thus
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2074fac8578914..04dfd0ea0d893a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -412,6 +412,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
     if (Subtarget.is64Bit()) {
+      setOperationPromotedToType(ISD::CTTZ , MVT::i32, MVT::i64);
       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
     }
@@ -3237,9 +3238,10 @@ bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
 }
 
 bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
-  // Speculate cttz only if we can directly use TZCNT or can promote to i32.
+  // Speculate cttz only if we can directly use TZCNT or can promote to i32/i64.
   return Subtarget.hasBMI() ||
-         (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
+         (!Ty->isVectorTy() &&
+          Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
 }
 
 bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
diff --git a/llvm/lib/Target/X86/X86InstrAsmAlias.td b/llvm/lib/Target/X86/X86InstrAsmAlias.td
index 423ee0e8c9bda2..5a4c3f61672b3f 100644
--- a/llvm/lib/Target/X86/X86InstrAsmAlias.td
+++ b/llvm/lib/Target/X86/X86InstrAsmAlias.td
@@ -124,19 +124,31 @@ def : InstAlias<"ccmp"#Cond#"{q} $dcf\t{$src2, $src1|$src1, $src2}",
 defm : CCMP_Aliases<"o" ,  0>;
 defm : CCMP_Aliases<"no",  1>;
 defm : CCMP_Aliases<"b" ,  2>;
+defm : CCMP_Aliases<"c" ,  2>;
+defm : CCMP_Aliases<"nae", 2>;
+defm : CCMP_Aliases<"nb",  3>;
+defm : CCMP_Aliases<"nc",  3>;
 defm : CCMP_Aliases<"ae",  3>;
 defm : CCMP_Aliases<"e" ,  4>;
+defm : CCMP_Aliases<"z" ,  4>;
 defm : CCMP_Aliases<"ne",  5>;
+defm : CCMP_Aliases<"nz",  5>;
 defm : CCMP_Aliases<"be",  6>;
+defm : CCMP_Aliases<"na",  6>;
+defm : CCMP_Aliases<"nbe", 7>;
 defm : CCMP_Aliases<"a" ,  7>;
 defm : CCMP_Aliases<"s" ,  8>;
 defm : CCMP_Aliases<"ns",  9>;
 defm : CCMP_Aliases<"t" , 10>;
 defm : CCMP_Aliases<"f", 11>;
 defm : CCMP_Aliases<"l" , 12>;
+defm : CCMP_Aliases<"nge",12>;
+defm : CCMP_Aliases<"nl", 13>;
 defm : CCMP_Aliases<"ge", 13>;
 defm : CCMP_Aliases<"le", 14>;
+defm : CCMP_Aliases<"ng", 14>;
 defm : CCMP_Aliases<"g" , 15>;
+defm : CCMP_Aliases<"nle",15>;
 
 // CTEST Instructions Alias
 multiclass CTEST_Aliases<string Cond, int CC> {
@@ -186,19 +198,31 @@ def : InstAlias<"ctest"#Cond#"{q} $dcf\t{$src2, $src1|$src1, $src2}",
 defm : CTEST_Aliases<"o" ,  0>;
 defm : CTEST_Aliases<"no",  1>;
 defm : CTEST_Aliases<"b" ,  2>;
+defm : CTEST_Aliases<"c" ,  2>;
+defm : CTEST_Aliases<"nae", 2>;
+defm : CTEST_Aliases<"nb",  3>;
+defm : CTEST_Aliases<"nc",  3>;
 defm : CTEST_Aliases<"ae",  3>;
 defm : CTEST_Aliases<"e" ,  4>;
+defm : CTEST_Aliases<"z" ,  4>;
 defm : CTEST_Aliases<"ne",  5>;
+defm : CTEST_Aliases<"nz",  5>;
 defm : CTEST_Aliases<"be",  6>;
+defm : CTEST_Aliases<"na",  6>;
+defm : CTEST_Aliases<"nbe", 7>;
 defm : CTEST_Aliases<"a" ,  7>;
 defm : CTEST_Aliases<"s" ,  8>;
 defm : CTEST_Aliases<"ns",  9>;
 defm : CTEST_Aliases<"t" , 10>;
 defm : CTEST_Aliases<"f", 11>;
 defm : CTEST_Aliases<"l" , 12>;
+defm : CTEST_Aliases<"nge",12>;
+defm : CTEST_Aliases<"nl", 13>;
 defm : CTEST_Aliases<"ge", 13>;
 defm : CTEST_Aliases<"le", 14>;
+defm : CTEST_Aliases<"ng", 14>;
 defm : CTEST_Aliases<"g" , 15>;
+defm : CTEST_Aliases<"nle",15>;
 
 //===----------------------------------------------------------------------===//
 // Assembler Mnemonic Aliases
@@ -208,13 +232,9 @@ defm : CMPCCXADD_Aliases<"o" ,  0>;
 defm : CMPCCXADD_Aliases<"no",  1>;
 defm : CMPCCXADD_Aliases<"b" ,  2>;
 defm : CMPCCXADD_Aliases<"ae",  3>;
-defm : CMPCCXADD_Aliases<"nb",  3>;
 defm : CMPCCXADD_Aliases<"e" ,  4>;
-defm : CMPCCXADD_Aliases<"z" ,  4>;
 defm : CMPCCXADD_Aliases<"ne",  5>;
-defm : CMPCCXADD_Aliases<"nz",  5>;
 defm : CMPCCXADD_Aliases<"be",  6>;
-defm : CMPCCXADD_Aliases<"nbe", 7>;
 defm : CMPCCXADD_Aliases<"a",   7>;
 defm : CMPCCXADD_Aliases<"s" ,  8>;
 defm : CMPCCXADD_Aliases<"ns",  9>;
@@ -222,10 +242,8 @@ defm : CMPCCXADD_Aliases<"p" , 10>;
 defm : CMPCCXADD_Aliases<"np", 11>;
 defm : CMPCCXADD_Aliases<"l" , 12>;
 defm : CMPCCXADD_Aliases<"ge", 13>;
-defm : CMPCCXADD_Aliases<"nl", 13>;
 defm : CMPCCXADD_Aliases<"le", 14>;
 defm : CMPCCXADD_Aliases<"g",  15>;
-defm : CMPCCXADD_Aliases<"nle",15>;
 
 
 def : MnemonicAlias<"call", "callw", "att">, Requires<[In16BitMode]>;
@@ -403,6 +421,7 @@ multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix,
 
 // Aliases for set<CC>
 defm : IntegerCondCodeMnemonicAlias<"set", "">;
+defm : IntegerCondCodeMnemonicAlias<"setzu", "">;
 // Aliases for j<CC>
 defm : IntegerCondCodeMnemonicAlias<"j", "">;
 // Aliases for cmov<CC>{w,l,q}
@@ -418,6 +437,9 @@ defm : IntegerCondCodeMnemonicAlias<"cfcmov", "l", "att">;
 defm : IntegerCondCodeMnemonicAlias<"cfcmov", "q", "att">;
 // No size suffix for intel-style asm.
 defm : IntegerCondCodeMnemonicAlias<"cfcmov", "", "intel">;
+
+// Aliases for cmp<CC>xadd
+defm : IntegerCondCodeMnemonicAlias<"cmp", "xadd", "">;
 //===----------------------------------------------------------------------===//
 // Assembler Instruction Aliases
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 5f9211edfa161b..4e5f2e3f872ad4 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -7797,6 +7797,11 @@ let Predicates = [HasAVX1Only] in {
             (VPSHUFDri (VMOV64toPQIrr GR64:$src), 0x44)>;
   def : Pat<(v2i64 (X86VBroadcastld64 addr:$src)),
             (VMOVDDUPrm addr:$src)>;
+
+  def : Pat<(v4i64 (X86VBroadcast v2i64:$src)),
+            (VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
+              (v2i64 (VPSHUFDri VR128:$src, 0x44)), sub_xmm),
+              (v2i64 (VPSHUFDri VR128:$src, 0x44)), 1)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index 99ec50aa4775c8..452fff7898d0ea 100644
--- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -371,7 +371,7 @@ doPromotion(Function *F, FunctionAnalysisManager &FAM,
     append_range(Worklist, Arg.users());
     while (!Worklist.empty()) {
       Value *V = Worklist.pop_back_val();
-      if (isa<BitCastInst>(V) || isa<GetElementPtrInst>(V)) {
+      if (isa<GetElementPtrInst>(V)) {
         DeadInsts.push_back(cast<Instruction>(V));
         append_range(Worklist, V->users());
         continue;
@@ -608,10 +608,6 @@ static bool findArgParts(Argument *Arg, const DataLayout &DL, AAResults &AAR,
   while (!Worklist.empty()) {
     const Use *U = Worklist.pop_back_val();
     Value *V = U->getUser();
-    if (isa<BitCastInst>(V)) {
-      AppendUses(V);
-      continue;
-    }
 
     if (auto *GEP = dyn_cast<GetElementPtrInst>(V)) {
       if (!GEP->hasAllConstantIndices())
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index db5e94806e9a16..8ece5bbdfc77e1 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -2327,8 +2327,8 @@ struct AANoFreeFloating : AANoFreeImpl {
             DepClassTy::REQUIRED, IsKnown);
       }
 
-      if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) ||
-          isa<PHINode>(UserI) || isa<SelectInst>(UserI)) {
+      if (isa<GetElementPtrInst>(UserI) || isa<PHINode>(UserI) ||
+          isa<SelectInst>(UserI)) {
         Follow = true;
         return true;
       }
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 2d7b7355229eaf..548335d750e33d 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -481,11 +481,6 @@ Constant *FunctionSpecializer::getPromotableAlloca(AllocaInst *Alloca,
     // the usage in the CallInst, which is what we check here.
     if (User == Call)
       continue;
-    if (auto *Bitcast = dyn_cast<BitCastInst>(User)) {
-      if (!Bitcast->hasOneUse() || *Bitcast->user_begin() != Call)
-        return nullptr;
-      continue;
-    }
 
     if (auto *Store = dyn_cast<StoreInst>(User)) {
       // This is a duplicate store, bail out.
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 5293a777496bc7..aae4926e027ff4 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -1050,11 +1050,6 @@ valueIsOnlyUsedLocallyOrStoredToOneGlobal(const CallInst *CI,
         continue; // Otherwise, storing through it, or storing into GV... fine.
       }
 
-      if (auto *BCI = dyn_cast<BitCastInst>(U)) {
-        Worklist.push_back(BCI);
-        continue;
-      }
-
       if (auto *GEPI = dyn_cast<GetElementPtrInst>(U)) {
         Worklist.push_back(GEPI);
         continue;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index a38c990b9ea83a..4f9a5bd2c17f03 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -104,7 +104,7 @@ static Type *getPromotedType(Type *Ty) {
 /// requires a deeper change to allow either unread or unwritten objects.
 static bool hasUndefSource(AnyMemTransferInst *MI) {
   auto *Src = MI->getRawSource();
-  while (isa<GetElementPtrInst>(Src) || isa<BitCastInst>(Src)) {
+  while (isa<GetElementPtrInst>(Src)) {
     if (!Src->hasOneUse())
       return false;
     Src = cast<Instruction>(Src)->getOperand(0);
@@ -260,13 +260,11 @@ Instruction *InstCombinerImpl::SimplifyAnyMemSet(AnyMemSetInst *MI) {
 
   // memset(s,c,n) -> store s, c (for n=1,2,4,8)
   if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
-    Type *ITy = IntegerType::get(MI->getContext(), Len*8);  // n=1 -> i8.
-
     Value *Dest = MI->getDest();
 
     // Extract the fill value and store.
-    const uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
-    Constant *FillVal = ConstantInt::get(ITy, Fill);
+    Constant *FillVal = ConstantInt::get(
+        MI->getContext(), APInt::getSplat(Len * 8, FillC->getValue()));
     StoreInst *S = Builder.CreateStore(FillVal, Dest, MI->isVolatile());
     S->copyMetadata(*MI, LLVMContext::MD_DIAssignID);
     auto replaceOpForAssignmentMarkers = [FillC, FillVal](auto *DbgAssign) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 10a89b47e07537..2b0347073b7d1f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -312,7 +312,7 @@ Instruction *InstCombinerImpl::foldCmpLoadFromIndexedGlobal(
       DL.getTypeAllocSize(Init->getType()->getArrayElementType());
   auto MaskIdx = [&](Value *Idx) {
     if (!GEP->isInBounds() && llvm::countr_zero(ElementSize) != 0) {
-      Value *Mask = ConstantInt::get(Idx->getType(), -1);
+      Value *Mask = Constant::getAllOnesValue(Idx->getType());
       Mask = Builder.CreateLShr(Mask, llvm::countr_zero(ElementSize));
       Idx = Builder.CreateAnd(Idx, Mask);
     }
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index e296626aa09bbd..df3d20c758a0ad 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -4707,8 +4707,7 @@ static bool SoleWriteToDeadLocal(Instruction *I, TargetLibraryInfo &TLI) {
   pushUsers(*AI);
   while (!AllocaUsers.empty()) {
     auto *UserI = cast<Instruction>(AllocaUsers.pop_back_val());
-    if (isa<BitCastInst>(UserI) || isa<GetElementPtrInst>(UserI) ||
-        isa<AddrSpaceCastInst>(UserI)) {
+    if (isa<GetElementPtrInst>(UserI) || isa<AddrSpaceCastInst>(UserI)) {
       pushUsers(*UserI);
       continue;
     }
diff --git a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index f3422a705dca7a..8555ef5c22f827 100644
--- a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -208,6 +208,7 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall,
       continue;
 
     if (Instruction *K = dyn_cast<Instruction>(J))
+      if (K->getFunction() == ACall->getFunction())
         WorkList.push_back(K);
   }
 
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 3abf3aa5542c27..613597b0878814 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -357,19 +357,19 @@ bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
   // Insert new integer induction variable.
   PHINode *NewPHI =
       PHINode::Create(Int32Ty, 2, PN->getName() + ".int", PN->getIterator());
-  NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue),
+  NewPHI->addIncoming(ConstantInt::getSigned(Int32Ty, InitValue),
                       PN->getIncomingBlock(IncomingEdge));
   NewPHI->setDebugLoc(PN->getDebugLoc());
 
-  Instruction *NewAdd =
-      BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue),
-                                Incr->getName() + ".int", Incr->getIterator());
+  Instruction *NewAdd = BinaryOperator::CreateAdd(
+      NewPHI, ConstantInt::getSigned(Int32Ty, IncValue),
+      Incr->getName() + ".int", Incr->getIterator());
   NewAdd->setDebugLoc(Incr->getDebugLoc());
   NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge));
 
-  ICmpInst *NewCompare =
-      new ICmpInst(TheBr->getIterator(), NewPred, NewAdd,
-                   ConstantInt::get(Int32Ty, ExitValue), Compare->getName());
+  ICmpInst *NewCompare = new ICmpInst(
+      TheBr->getIterator(), NewPred, NewAdd,
+      ConstantInt::getSigned(Int32Ty, ExitValue), Compare->getName());
   NewCompare->setDebugLoc(Compare->getDebugLoc());
 
   // In the following deletions, PN may become dead and may be deleted.
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index 8512b2accbe7c2..fe0e30d1965e05 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -1729,7 +1729,9 @@ struct LoopFuser {
     // mergeLatch may remove the only block in FC1.
     SE.forgetLoop(FC1.L);
     SE.forgetLoop(FC0.L);
-    SE.forgetLoopDispositions();
+    // Forget block dispositions as well, so that there are no dangling
+    // pointers to erased/freed blocks.
+    SE.forgetBlockAndLoopDispositions();
 
     // Move instructions from FC0.Latch to FC1.Latch.
     // Note: mergeLatch requires an updated DT.
@@ -2023,7 +2025,9 @@ struct LoopFuser {
     // mergeLatch may remove the only block in FC1.
     SE.forgetLoop(FC1.L);
     SE.forgetLoop(FC0.L);
-    SE.forgetLoopDispositions();
+    // Forget block dispositions as well, so that there are no dangling
+    // pointers to erased/freed blocks.
+    SE.forgetBlockAndLoopDispositions();
 
     // Move instructions from FC0.Latch to FC1.Latch.
     // Note: mergeLatch requires an updated DT.
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index cee34f0a6da1f3..1d779128e454c1 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -980,12 +980,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
   while (!srcUseList.empty()) {
     User *U = srcUseList.pop_back_val();
 
-    if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
-      append_range(srcUseList, U->users());
-      continue;
-    }
-    if (const auto *G = dyn_cast<GetElementPtrInst>(U);
-        G && G->hasAllZeroIndices()) {
+    if (isa<AddrSpaceCastInst>(U)) {
       append_range(srcUseList, U->users());
       continue;
     }
diff --git a/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/llvm/lib/Transforms/Utils/GlobalStatus.cpp
index b177e048faae0f..0b3016a86e2875 100644
--- a/llvm/lib/Transforms/Utils/GlobalStatus.cpp
+++ b/llvm/lib/Transforms/Utils/GlobalStatus.cpp
@@ -143,9 +143,8 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
             GS.StoredType = GlobalStatus::Stored;
           }
         }
-      } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I) ||
-                 isa<AddrSpaceCastInst>(I)) {
-        // Skip over bitcasts and GEPs; we don't care about the type or offset
+      } else if (isa<GetElementPtrInst>(I) || isa<AddrSpaceCastInst>(I)) {
+        // Skip over GEPs; we don't care about the type or offset
         // of the pointer.
         if (analyzeGlobalAux(I, GS, VisitedUsers))
           return true;
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 9c9fc7a49a9d18..94e87656a192c7 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1896,8 +1896,8 @@ static void trackInlinedStores(Function::iterator Start, Function::iterator End,
   LLVM_DEBUG(errs() << "trackInlinedStores into "
                     << Start->getParent()->getName() << " from "
                     << CB.getCalledFunction()->getName() << "\n");
-  std::unique_ptr<DataLayout> DL = std::make_unique<DataLayout>(CB.getModule());
-  at::trackAssignments(Start, End, collectEscapedLocals(*DL, CB), *DL);
+  const DataLayout &DL = CB.getDataLayout();
+  at::trackAssignments(Start, End, collectEscapedLocals(DL, CB), DL);
 }
 
 /// Update inlined instructions' DIAssignID metadata. We need to do this
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index a0030db761c91a..a8ddbfe7e9c8d7 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -7661,10 +7661,6 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValu
       }
     }
 
-    // Look through bitcasts.
-    if (BitCastInst *BC = dyn_cast<BitCastInst>(Use))
-      return passingValueIsAlwaysUndefined(V, BC, PtrValueMayBeModified);
-
     // Load from null is undefined.
     if (LoadInst *LI = dyn_cast<LoadInst>(Use))
       if (!LI->isVolatile())
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5547116133e8da..f5337b11edc977 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3139,12 +3139,10 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
     return WideningDecision != CM_GatherScatter;
   };
 
-  // A helper that returns true if the given value is a bitcast or
-  // getelementptr instruction contained in the loop.
-  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
-    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
-            isa<GetElementPtrInst>(V)) &&
-           !TheLoop->isLoopInvariant(V);
+  // A helper that returns true if the given value is a getelementptr
+  // instruction contained in the loop.
+  auto isLoopVaryingGEP = [&](Value *V) {
+    return isa<GetElementPtrInst>(V) && !TheLoop->isLoopInvariant(V);
   };
 
   // A helper that evaluates a memory access's use of a pointer. If the use will
@@ -3154,7 +3152,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
     // We only care about bitcast and getelementptr instructions contained in
     // the loop.
-    if (!isLoopVaryingBitCastOrGEP(Ptr))
+    if (!isLoopVaryingGEP(Ptr))
       return;
 
     // If the pointer has already been identified as scalar (e.g., if it was
@@ -3220,7 +3218,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
   unsigned Idx = 0;
   while (Idx != Worklist.size()) {
     Instruction *Dst = Worklist[Idx++];
-    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
+    if (!isLoopVaryingGEP(Dst->getOperand(0)))
       continue;
     auto *Src = cast<Instruction>(Dst->getOperand(0));
     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
@@ -3705,7 +3703,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
     auto *I = cast<Instruction>(V);
     auto UsersAreMemAccesses =
       llvm::all_of(I->users(), [&](User *U) -> bool {
-        return isVectorizedMemAccessUse(cast<Instruction>(U), V);
+        auto *UI = cast<Instruction>(U);
+        return TheLoop->contains(UI) && isVectorizedMemAccessUse(UI, V);
       });
     if (UsersAreMemAccesses)
       addToWorklistIfAllowed(I);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f30a76af832790..feffd9ae3c99b7 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -268,6 +268,98 @@ static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
   Mask.swap(NewMask);
 }
 
+/// \returns the number of shufflevector groups in \p VL, or 0 if \p VL cannot
+/// be split into such groups. A group has the following features:
+/// 1. All of the values in a group are shufflevectors of the same source.
+/// 2. The mask of every shufflevector is isExtractSubvectorMask.
+/// 3. Together, the masks use all of the elements of the source, in order.
+/// e.g., it is 1 group (%0)
+/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
+///    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
+///    <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+/// it is 2 groups (%3 and %4)
+/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
+///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
+///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
+///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
+///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+/// it is 0 groups (neither %10 nor %11 is used in full)
+/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
+///     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
+///     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
+  if (VL.empty())
+    return 0;
+  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
+    return 0;
+  auto *SV = cast<ShuffleVectorInst>(VL.front());
+  unsigned SVNumElements =
+      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
+  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
+  // Subvectors can only cover the whole source if their size divides it.
+  if (ShuffleMaskSize == 0 || SVNumElements % ShuffleMaskSize != 0)
+    return 0;
+  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
+  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
+    return 0;
+  unsigned NumGroup = 0;
+  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
+    Value *Src = cast<ShuffleVectorInst>(VL[I])->getOperand(0);
+    // Offset in Src that the next shufflevector of the group must start at.
+    unsigned ExpectedIndex = 0;
+    if (!all_of(VL.slice(I, GroupSize), [&](Value *V) {
+          auto *SV = cast<ShuffleVectorInst>(V);
+          // From the same source.
+          if (SV->getOperand(0) != Src)
+            return false;
+          int Index;
+          if (!SV->isExtractSubvectorMask(Index) ||
+              static_cast<unsigned>(Index) != ExpectedIndex)
+            return false;
+          ExpectedIndex += ShuffleMaskSize;
+          return true;
+        }))
+      return 0;
+    ++NumGroup;
+  }
+  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
+  return NumGroup;
+}
+
+/// \returns the mask of a single shufflevector that stands in for all of the
+/// shufflevectors in \p VL: the mask elements of the I-th value are offset by
+/// I times the element count of the first source; poison elements stay poison.
+/// e.g.,
+/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
+///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
+///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
+///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
+///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+/// the result is
+/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
+static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
+  assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
+  Type *SrcTy = cast<ShuffleVectorInst>(VL.front())->getOperand(0)->getType();
+  unsigned SrcNumElements = cast<FixedVectorType>(SrcTy)->getNumElements();
+  SmallVector<int> Result;
+  // Index offset applied to the mask elements of the current value.
+  unsigned Base = 0;
+  for (Value *V : VL) {
+    for (int Elt : cast<ShuffleVectorInst>(V)->getShuffleMask())
+      Result.push_back(Elt == PoisonMaskElem ? PoisonMaskElem : Base + Elt);
+    Base += SrcNumElements;
+  }
+  return Result;
+}
+
 /// \returns True if the value is a constant (but not globals/constant
 /// expressions).
 static bool isConstant(Value *V) {
@@ -6016,6 +6108,9 @@ BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
   DenseMap<Value *, SmallVector<StoreInst *>> PtrToStoresMap;
   for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
     Value *V = TE->Scalars[Lane];
+    // Don't iterate over the users of constant data.
+    if (isa<ConstantData>(V))
+      continue;
     // To save compilation time we don't visit if we have too many users.
     if (V->hasNUsesOrMore(UsesLimit))
       break;
@@ -6023,7 +6118,9 @@ BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
     // Collect stores per pointer object.
     for (User *U : V->users()) {
       auto *SI = dyn_cast<StoreInst>(U);
-      if (SI == nullptr || !SI->isSimple() ||
+      // Test whether we can handle the store. V might be a global, which could
+      // be used in a different function.
+      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
           !isValidElementType(SI->getValueOperand()->getType()))
         continue;
       // Skip entry if already
@@ -6698,9 +6795,12 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
     return TreeEntry::Vectorize;
   }
   case Instruction::ShuffleVector: {
-    // If this is not an alternate sequence of opcode like add-sub
-    // then do not vectorize this instruction.
     if (!S.isAltShuffle()) {
+      // REVEC can support non alternate shuffle.
+      if (SLPReVec && getShufflevectorNumGroups(VL))
+        return TreeEntry::Vectorize;
+      // If this is not an alternate sequence of opcode like add-sub
+      // then do not vectorize this instruction.
       LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
       return TreeEntry::NeedToGather;
     }
@@ -9427,8 +9527,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
   // that the costs will be accurate.
   auto It = MinBWs.find(E);
   Type *OrigScalarTy = ScalarTy;
-  if (It != MinBWs.end())
+  if (It != MinBWs.end()) {
+    auto VecTy = dyn_cast<FixedVectorType>(ScalarTy);
     ScalarTy = IntegerType::get(F->getContext(), It->second.first);
+    if (VecTy)
+      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
+  }
   auto *VecTy = getWidenedType(ScalarTy, VL.size());
   unsigned EntryVF = E->getVectorFactor();
   auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
@@ -9773,16 +9877,18 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
     unsigned Opcode = ShuffleOrOp;
     unsigned VecOpcode = Opcode;
-    if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
+    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
         (SrcIt != MinBWs.end() || It != MinBWs.end())) {
       // Check if the values are candidates to demote.
-      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
+      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
       if (SrcIt != MinBWs.end()) {
         SrcBWSz = SrcIt->second.first;
+        unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
         SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
-        SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
+        SrcVecTy =
+            getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
       }
-      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
+      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
       if (BWSz == SrcBWSz) {
         VecOpcode = Instruction::BitCast;
       } else if (BWSz < SrcBWSz) {
@@ -10046,13 +10152,14 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     return GetCostDiff(GetScalarCost, GetVectorCost);
   }
   case Instruction::ShuffleVector: {
-    assert(E->isAltShuffle() &&
-           ((Instruction::isBinaryOp(E->getOpcode()) &&
-             Instruction::isBinaryOp(E->getAltOpcode())) ||
-            (Instruction::isCast(E->getOpcode()) &&
-             Instruction::isCast(E->getAltOpcode())) ||
-            (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
-           "Invalid Shuffle Vector Operand");
+    if (!SLPReVec || E->isAltShuffle())
+      assert(E->isAltShuffle() &&
+             ((Instruction::isBinaryOp(E->getOpcode()) &&
+               Instruction::isBinaryOp(E->getAltOpcode())) ||
+              (Instruction::isCast(E->getOpcode()) &&
+               Instruction::isCast(E->getAltOpcode())) ||
+              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
+             "Invalid Shuffle Vector Operand");
     // Try to find the previous shuffle node with the same operands and same
     // main/alternate ops.
     auto TryFindNodeWithEqualOperands = [=]() {
@@ -10160,6 +10267,13 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       // TODO: Check the reverse order too.
       return VecCost;
     };
+    if (SLPReVec && !E->isAltShuffle())
+      return GetCostDiff(GetScalarCost, [](InstructionCost) {
+        // shufflevector will be eliminated by instcombine because the
+        // shufflevector masks are used in order (guaranteed by
+        // getShufflevectorNumGroups). The vector cost is 0.
+        return TTI::TCC_Free;
+      });
     return GetCostDiff(GetScalarCost, GetVectorCost);
   }
   case Instruction::Freeze:
@@ -12478,8 +12592,15 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
     }
     if (IsSameVE) {
       auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
+        // V may be affected by MinBWs.
+        // We want ShuffleInstructionBuilder to correctly support REVEC. The key
+        // factor is the number of elements, not their type.
+        Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
+        unsigned NumElements = getNumElements(VL.front()->getType());
         ShuffleInstructionBuilder ShuffleBuilder(
-            cast<VectorType>(V->getType())->getElementType(), Builder, *this);
+            NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements)
+                             : ScalarTy,
+            Builder, *this);
         ShuffleBuilder.add(V, Mask);
         return ShuffleBuilder.finalize(std::nullopt);
       };
@@ -13012,8 +13133,12 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
   else if (auto *IE = dyn_cast<InsertElementInst>(V))
     ScalarTy = IE->getOperand(1)->getType();
   auto It = MinBWs.find(E);
-  if (It != MinBWs.end())
+  if (It != MinBWs.end()) {
+    auto VecTy = dyn_cast<FixedVectorType>(ScalarTy);
     ScalarTy = IntegerType::get(F->getContext(), It->second.first);
+    if (VecTy)
+      ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
+  }
   auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
   if (E->isGather()) {
     // Set insert point for non-reduction initial nodes.
@@ -13329,14 +13454,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       Instruction::CastOps VecOpcode = CI->getOpcode();
       Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
       auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
-      if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
+      if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
           (SrcIt != MinBWs.end() || It != MinBWs.end() ||
-           SrcScalarTy != CI->getOperand(0)->getType())) {
+           SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
         // Check if the values are candidates to demote.
         unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
         if (SrcIt != MinBWs.end())
           SrcBWSz = SrcIt->second.first;
-        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
+        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
         if (BWSz == SrcBWSz) {
           VecOpcode = Instruction::BitCast;
         } else if (BWSz < SrcBWSz) {
@@ -13809,128 +13934,151 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
       return V;
     }
     case Instruction::ShuffleVector: {
-      assert(E->isAltShuffle() &&
-             ((Instruction::isBinaryOp(E->getOpcode()) &&
-               Instruction::isBinaryOp(E->getAltOpcode())) ||
-              (Instruction::isCast(E->getOpcode()) &&
-               Instruction::isCast(E->getAltOpcode())) ||
-              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
-             "Invalid Shuffle Vector Operand");
-
-      Value *LHS = nullptr, *RHS = nullptr;
-      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
+      Value *V;
+      if (SLPReVec && !E->isAltShuffle()) {
+        assert(E->ReuseShuffleIndices.empty() &&
+               "Not support ReuseShuffleIndices yet.");
+        assert(E->ReorderIndices.empty() && "Not support ReorderIndices yet.");
         setInsertPointAfterBundle(E);
-        LHS = vectorizeOperand(E, 0, PostponedPHIs);
+        Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
         if (E->VectorizedValue) {
           LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
           return E->VectorizedValue;
         }
-        RHS = vectorizeOperand(E, 1, PostponedPHIs);
-      } else {
-        setInsertPointAfterBundle(E);
-        LHS = vectorizeOperand(E, 0, PostponedPHIs);
-      }
-      if (E->VectorizedValue) {
-        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
-        return E->VectorizedValue;
-      }
-      if (LHS && RHS &&
-          ((Instruction::isBinaryOp(E->getOpcode()) &&
-            (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
-           (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
-        assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
-                getOperandEntry(E, 1)->isGather() ||
-                MinBWs.contains(getOperandEntry(E, 0)) ||
-                MinBWs.contains(getOperandEntry(E, 1))) &&
-               "Expected item in MinBWs.");
-        Type *CastTy = VecTy;
-        if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
-          if (cast<VectorType>(LHS->getType())
-                  ->getElementType()
-                  ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
-                                               ->getElementType()
-                                               ->getIntegerBitWidth())
-            CastTy = RHS->getType();
-          else
-            CastTy = LHS->getType();
-        }
-        if (LHS->getType() != CastTy)
-          LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
-        if (RHS->getType() != CastTy)
-          RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
-      }
-
-      Value *V0, *V1;
-      if (Instruction::isBinaryOp(E->getOpcode())) {
-        V0 = Builder.CreateBinOp(
-            static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
-        V1 = Builder.CreateBinOp(
-            static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
-      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
-        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
-        auto *AltCI = cast<CmpInst>(E->getAltOp());
-        CmpInst::Predicate AltPred = AltCI->getPredicate();
-        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
+        // The current shufflevector usage always duplicates the source.
+        V = Builder.CreateShuffleVector(Src,
+                                        calculateShufflevectorMask(E->Scalars));
+        propagateIRFlags(V, E->Scalars, VL0);
       } else {
-        if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
-          unsigned SrcBWSz = DL->getTypeSizeInBits(
-              cast<VectorType>(LHS->getType())->getElementType());
-          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
-          if (BWSz <= SrcBWSz) {
-            if (BWSz < SrcBWSz)
-              LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
-            assert(LHS->getType() == VecTy && "Expected same type as operand.");
-            if (auto *I = dyn_cast<Instruction>(LHS))
-              LHS = propagateMetadata(I, E->Scalars);
-            E->VectorizedValue = LHS;
-            ++NumVectorInstructions;
-            return LHS;
+        assert(E->isAltShuffle() &&
+               ((Instruction::isBinaryOp(E->getOpcode()) &&
+                 Instruction::isBinaryOp(E->getAltOpcode())) ||
+                (Instruction::isCast(E->getOpcode()) &&
+                 Instruction::isCast(E->getAltOpcode())) ||
+                (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
+               "Invalid Shuffle Vector Operand");
+
+        Value *LHS = nullptr, *RHS = nullptr;
+        if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
+          setInsertPointAfterBundle(E);
+          LHS = vectorizeOperand(E, 0, PostponedPHIs);
+          if (E->VectorizedValue) {
+            LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+            return E->VectorizedValue;
           }
+          RHS = vectorizeOperand(E, 1, PostponedPHIs);
+        } else {
+          setInsertPointAfterBundle(E);
+          LHS = vectorizeOperand(E, 0, PostponedPHIs);
         }
-        V0 = Builder.CreateCast(
-            static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
-        V1 = Builder.CreateCast(
-            static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
-      }
-      // Add V0 and V1 to later analysis to try to find and remove matching
-      // instruction, if any.
-      for (Value *V : {V0, V1}) {
-        if (auto *I = dyn_cast<Instruction>(V)) {
-          GatherShuffleExtractSeq.insert(I);
-          CSEBlocks.insert(I->getParent());
+        if (E->VectorizedValue) {
+          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+          return E->VectorizedValue;
+        }
+        if (LHS && RHS &&
+            ((Instruction::isBinaryOp(E->getOpcode()) &&
+              (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
+             (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
+          assert((It != MinBWs.end() ||
+                  getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
+                  getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
+                  MinBWs.contains(getOperandEntry(E, 0)) ||
+                  MinBWs.contains(getOperandEntry(E, 1))) &&
+                 "Expected item in MinBWs.");
+          Type *CastTy = VecTy;
+          if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
+            if (cast<VectorType>(LHS->getType())
+                    ->getElementType()
+                    ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
+                                                 ->getElementType()
+                                                 ->getIntegerBitWidth())
+              CastTy = RHS->getType();
+            else
+              CastTy = LHS->getType();
+          }
+          if (LHS->getType() != CastTy)
+            LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
+          if (RHS->getType() != CastTy)
+            RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
         }
-      }
 
-      // Create shuffle to take alternate operations from the vector.
-      // Also, gather up main and alt scalar ops to propagate IR flags to
-      // each vector operation.
-      ValueList OpScalars, AltScalars;
-      SmallVector<int> Mask;
-      E->buildAltOpShuffleMask(
-          [E, this](Instruction *I) {
-            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
-            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
-                                          *TLI);
-          },
-          Mask, &OpScalars, &AltScalars);
+        Value *V0, *V1;
+        if (Instruction::isBinaryOp(E->getOpcode())) {
+          V0 = Builder.CreateBinOp(
+              static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
+          V1 = Builder.CreateBinOp(
+              static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
+        } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
+          V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
+          auto *AltCI = cast<CmpInst>(E->getAltOp());
+          CmpInst::Predicate AltPred = AltCI->getPredicate();
+          V1 = Builder.CreateCmp(AltPred, LHS, RHS);
+        } else {
+          if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
+            unsigned SrcBWSz = DL->getTypeSizeInBits(
+                cast<VectorType>(LHS->getType())->getElementType());
+            unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
+            if (BWSz <= SrcBWSz) {
+              if (BWSz < SrcBWSz)
+                LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
+              assert(LHS->getType() == VecTy &&
+                     "Expected same type as operand.");
+              if (auto *I = dyn_cast<Instruction>(LHS))
+                LHS = propagateMetadata(I, E->Scalars);
+              E->VectorizedValue = LHS;
+              ++NumVectorInstructions;
+              return LHS;
+            }
+          }
+          V0 = Builder.CreateCast(
+              static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
+          V1 = Builder.CreateCast(
+              static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
+        }
+        // Add V0 and V1 to later analysis to try to find and remove matching
+        // instruction, if any.
+        for (Value *V : {V0, V1}) {
+          if (auto *I = dyn_cast<Instruction>(V)) {
+            GatherShuffleExtractSeq.insert(I);
+            CSEBlocks.insert(I->getParent());
+          }
+        }
 
-      propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
-      propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
-      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
-        // Drop nuw flags for abs(sub(commutative), true).
-        if (auto *I = dyn_cast<Instruction>(Vec);
-            I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
-            any_of(E->Scalars, [](Value *V) {
-              auto *IV = cast<Instruction>(V);
-              return IV->getOpcode() == Instruction::Sub &&
-                     isCommutative(cast<Instruction>(IV));
-            }))
-          I->setHasNoUnsignedWrap(/*b=*/false);
-      };
-      DropNuwFlag(V0, E->getOpcode());
-      DropNuwFlag(V1, E->getAltOpcode());
+        // Create shuffle to take alternate operations from the vector.
+        // Also, gather up main and alt scalar ops to propagate IR flags to
+        // each vector operation.
+        ValueList OpScalars, AltScalars;
+        SmallVector<int> Mask;
+        E->buildAltOpShuffleMask(
+            [E, this](Instruction *I) {
+              assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+              return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
+                                            *TLI);
+            },
+            Mask, &OpScalars, &AltScalars);
+
+        propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
+        propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
+        auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
+          // Drop nuw flags for abs(sub(commutative), true).
+          if (auto *I = dyn_cast<Instruction>(Vec);
+              I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
+              any_of(E->Scalars, [](Value *V) {
+                auto *IV = cast<Instruction>(V);
+                return IV->getOpcode() == Instruction::Sub &&
+                       isCommutative(cast<Instruction>(IV));
+              }))
+            I->setHasNoUnsignedWrap(/*b=*/false);
+        };
+        DropNuwFlag(V0, E->getOpcode());
+        DropNuwFlag(V1, E->getAltOpcode());
 
-      Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
+        if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
+          assert(SLPReVec && "FixedVectorType is not expected.");
+          transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
+        }
+        V = Builder.CreateShuffleVector(V0, V1, Mask);
+      }
       if (auto *I = dyn_cast<Instruction>(V)) {
         V = propagateMetadata(I, E->Scalars);
         GatherShuffleExtractSeq.insert(I);
@@ -15433,7 +15581,8 @@ bool BoUpSLP::collectValuesToDemote(
   if (all_of(E.Scalars, IsaPred<Constant>))
     return true;
 
-  unsigned OrigBitWidth = DL->getTypeSizeInBits(E.Scalars.front()->getType());
+  unsigned OrigBitWidth =
+      DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
   if (OrigBitWidth == BitWidth) {
     MaxDepthLevel = 1;
     return true;
@@ -15864,7 +16013,9 @@ void BoUpSLP::computeMinimumValueSizes() {
     }
 
     unsigned VF = E.getVectorFactor();
-    auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
+    Type *ScalarTy = E.Scalars.front()->getType();
+    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
+    auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
     if (!TreeRootIT || !Opcode)
       return 0u;
 
@@ -15872,7 +16023,8 @@ void BoUpSLP::computeMinimumValueSizes() {
                [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
       return 0u;
 
-    unsigned NumParts = TTI->getNumberOfParts(getWidenedType(TreeRootIT, VF));
+    unsigned NumParts = TTI->getNumberOfParts(
+        getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
 
     // The maximum bit width required to represent all the values that can be
     // demoted without loss of precision. It would be safe to truncate the roots
@@ -15894,7 +16046,8 @@ void BoUpSLP::computeMinimumValueSizes() {
     // we can truncate the roots to this narrower type.
     for (Value *Root : E.Scalars) {
       unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
-      TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
+      TypeSize NumTypeBits =
+          DL->getTypeSizeInBits(Root->getType()->getScalarType());
       unsigned BitWidth1 = NumTypeBits - NumSignBits;
       // If we can't prove that the sign bit is zero, we must add one to the
       // maximum bit width to account for the unknown sign bit. This preserves
@@ -16014,7 +16167,8 @@ void BoUpSLP::computeMinimumValueSizes() {
 
     for (unsigned Idx : RootDemotes) {
       if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
-            uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
+            uint32_t OrigBitWidth =
+                DL->getTypeSizeInBits(V->getType()->getScalarType());
             if (OrigBitWidth > MaxBitWidth) {
               APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
               return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
@@ -16065,7 +16219,8 @@ void BoUpSLP::computeMinimumValueSizes() {
     // type, we can proceed with the narrowing. Otherwise, do nothing.
     if (MaxBitWidth == 0 ||
         MaxBitWidth >=
-            cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
+            cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
+                ->getBitWidth()) {
       if (UserIgnoreList)
         AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
       continue;
@@ -17820,8 +17975,37 @@ class HorizontalReduction {
                                          SameValuesCounter, TrackedToOrig);
         }
 
-        Value *ReducedSubTree =
-            emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
+        Value *ReducedSubTree;
+        Type *ScalarTy = VL.front()->getType();
+        if (isa<FixedVectorType>(ScalarTy)) {
+          assert(SLPReVec && "FixedVectorType is not expected.");
+          unsigned ScalarTyNumElements = getNumElements(ScalarTy);
+          ReducedSubTree = PoisonValue::get(FixedVectorType::get(
+              VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
+          for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
+            // Do reduction for each lane.
+            // e.g., do reduce add for
+            // VL[0] = <4 x Ty> <a, b, c, d>
+            // VL[1] = <4 x Ty> <e, f, g, h>
+            // Lane[0] = <2 x Ty> <a, e>
+            // Lane[1] = <2 x Ty> <b, f>
+            // Lane[2] = <2 x Ty> <c, g>
+            // Lane[3] = <2 x Ty> <d, h>
+            // result[0] = reduce add Lane[0]
+            // result[1] = reduce add Lane[1]
+            // result[2] = reduce add Lane[2]
+            // result[3] = reduce add Lane[3]
+            SmallVector<int, 16> Mask =
+                createStrideMask(I, ScalarTyNumElements, VL.size());
+            Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
+            ReducedSubTree = Builder.CreateInsertElement(
+                ReducedSubTree, emitReduction(Lane, Builder, ReduxWidth, TTI),
+                I);
+          }
+        } else {
+          ReducedSubTree =
+              emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
+        }
         if (ReducedSubTree->getType() != VL.front()->getType()) {
           assert(ReducedSubTree->getType() != VL.front()->getType() &&
                  "Expected different reduction type.");
@@ -18049,9 +18233,25 @@ class HorizontalReduction {
     case RecurKind::FAdd:
     case RecurKind::FMul: {
       unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
-      if (!AllConsts)
-        VectorCost =
-            TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
+      if (!AllConsts) {
+        if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
+          assert(SLPReVec && "FixedVectorType is not expected.");
+          unsigned ScalarTyNumElements = VecTy->getNumElements();
+          for (unsigned I : seq<unsigned>(ReducedVals.size())) {
+            VectorCost += TTI->getShuffleCost(
+                TTI::SK_PermuteSingleSrc, VectorTy,
+                createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
+            VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
+                                                          CostKind);
+          }
+          VectorCost += TTI->getScalarizationOverhead(
+              VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
+              /*Extract*/ false, TTI::TCK_RecipThroughput);
+        } else {
+          VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF,
+                                                       CostKind);
+        }
+      }
       ScalarCost = EvaluateScalarCost([&]() {
         return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
       });
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1a93f275a39f5f..911b2fe9e9a1eb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1652,24 +1652,25 @@ void VPBlendRecipe::execute(VPTransformState &State) {
   //                      In0)))
   // Note that Mask0 is never used: lanes for which no path reaches this phi and
   // are essentially undef are taken from In0.
- VectorParts Entry(State.UF);
- bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
- for (unsigned In = 0; In < NumIncoming; ++In) {
-   for (unsigned Part = 0; Part < State.UF; ++Part) {
-     // We might have single edge PHIs (blocks) - use an identity
-     // 'select' for the first PHI operand.
-     Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed);
-     if (In == 0)
-       Entry[Part] = In0; // Initialize with the first incoming value.
-     else {
-       // Select between the current value and the previous incoming edge
-       // based on the incoming mask.
-       Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed);
-       Entry[Part] =
-           State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
-     }
-   }
- }
+  VectorParts Entry(State.UF);
+  bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
+  for (unsigned In = 0; In < NumIncoming; ++In) {
+    for (unsigned Part = 0; Part < State.UF; ++Part) {
+      // We might have single edge PHIs (blocks) - use an identity
+      // 'select' for the first PHI operand.
+      Value *In0 = State.get(getIncomingValue(In), Part, OnlyFirstLaneUsed);
+      if (In == 0)
+        Entry[Part] = In0; // Initialize with the first incoming value.
+      else {
+        // Select between the current value and the previous incoming edge
+        // based on the incoming mask.
+        Value *Cond = State.get(getMask(In), Part, OnlyFirstLaneUsed);
+        Entry[Part] =
+            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
+      }
+    }
+  }
+
   for (unsigned Part = 0; Part < State.UF; ++Part)
     State.set(this, Entry[Part], Part, OnlyFirstLaneUsed);
 }
diff --git a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
index 25b169e08a7231..a485bad2a477e9 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll
@@ -39,13 +39,9 @@ define i64 @var_cttz_i64u(i64 %a) {
 }
 
 define i32 @var_cttz_i32(i32 %a) {
-; NOBMI-LABEL: 'var_cttz_i32'
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz
-;
-; BMI-LABEL: 'var_cttz_i32'
-; BMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
-; BMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz
+; CHECK-LABEL: 'var_cttz_i32'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz
 ;
   %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 0)
   ret i32 %cttz
diff --git a/llvm/test/Analysis/CostModel/X86/cttz-latency.ll b/llvm/test/Analysis/CostModel/X86/cttz-latency.ll
index a8abba56ba49fc..066e3232612f25 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz-latency.ll
@@ -39,13 +39,9 @@ define i64 @var_cttz_i64u(i64 %a) {
 }
 
 define i32 @var_cttz_i32(i32 %a) {
-; NOBMI-LABEL: 'var_cttz_i32'
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz
-;
-; BMI-LABEL: 'var_cttz_i32'
-; BMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
-; BMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz
+; CHECK-LABEL: 'var_cttz_i32'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz
 ;
   %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 0)
   ret i32 %cttz
diff --git a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
index 294f101f571bf2..cc71bb5c908839 100644
--- a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll
@@ -39,13 +39,9 @@ define i64 @var_cttz_i64u(i64 %a) {
 }
 
 define i32 @var_cttz_i32(i32 %a) {
-; NOBMI-LABEL: 'var_cttz_i32'
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
-; NOBMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz
-;
-; BMI-LABEL: 'var_cttz_i32'
-; BMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
-; BMI-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz
+; CHECK-LABEL: 'var_cttz_i32'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz
 ;
   %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 0)
   ret i32 %cttz
diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
index 88a22f98feb256..1de3e2a853dd86 100644
--- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
+++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -237,17 +237,17 @@ define void @cttz(i32 %a, <16 x i32> %va) {
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'cttz'
-; LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
 ; LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE-LABEL: 'cttz'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SIZE_LATE-LABEL: 'cttz'
-; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
+; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false)
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false)
 ; SIZE_LATE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
diff --git a/llvm/test/Analysis/ScalarEvolution/different-loops-recs.ll b/llvm/test/Analysis/ScalarEvolution/different-loops-recs.ll
index 4d5f08d41b9cee..359e22fa41bacd 100644
--- a/llvm/test/Analysis/ScalarEvolution/different-loops-recs.ll
+++ b/llvm/test/Analysis/ScalarEvolution/different-loops-recs.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -disable-output "-passes=print<scalar-evolution>" < %s 2>&1 | FileCheck %s
 
 ; This test set ensures that we can correctly operate with recurrencies from
@@ -7,28 +8,62 @@
 ; order.
 
 define void @test_00() {
-
-; CHECK-LABEL: Classifying expressions for: @test_00
-; CHECK:       %sum1 = add i32 %phi1, %phi2
-; CHECK-NEXT:  -->  {14,+,3}<%loop1>
-; CHECK:       %sum2 = add i32 %sum1, %phi3
-; CHECK-NEXT:  -->  {20,+,6}<%loop1>
-; CHECK:       %sum3 = add i32 %phi4, %phi5
-; CHECK-NEXT:  -->  {116,+,3}<%loop2>
-; CHECK:       %sum4 = add i32 %sum3, %phi6
-; CHECK-NEXT:  -->  {159,+,6}<%loop2>
-; CHECK:       %s1 = add i32 %phi1, %phi4
-; CHECK-NEXT:  -->  {{\{\{}}73,+,1}<nuw><nsw><%loop1>,+,1}<nw><%loop2>
-; CHECK:       %s2 = add i32 %phi5, %phi2
-; CHECK-NEXT:  -->  {{\{\{}}57,+,2}<nuw><nsw><%loop1>,+,2}<nw><%loop2>
-; CHECK:       %s3 = add i32 %sum1, %sum3
-; CHECK-NEXT:  -->  {{\{\{}}130,+,3}<%loop1>,+,3}<%loop2>
-; CHECK:       %s4 = add i32 %sum4, %sum2
-; CHECK-NEXT:  -->  {{\{\{}}179,+,6}<%loop1>,+,6}<%loop2>
-; CHECK:       %s5 = add i32 %phi3, %sum3
-; CHECK-NEXT:  -->  {{\{\{}}122,+,3}<nuw><nsw><%loop1>,+,3}<%loop2>
-; CHECK:       %s6 = add i32 %sum2, %phi6
-; CHECK-NEXT:  -->  {{\{\{}}63,+,6}<%loop1>,+,3}<nw><%loop2>
+; CHECK-LABEL: 'test_00'
+; CHECK-NEXT:  Classifying expressions for: @test_00
+; CHECK-NEXT:    %phi1 = phi i32 [ 10, %entry ], [ %phi1.inc, %loop1 ]
+; CHECK-NEXT:    --> {10,+,1}<nuw><nsw><%loop1> U: [10,175) S: [10,175) Exits: 174 LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi2 = phi i32 [ 4, %entry ], [ %phi2.inc, %loop1 ]
+; CHECK-NEXT:    --> {4,+,2}<nuw><nsw><%loop1> U: [4,333) S: [4,333) Exits: 332 LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi3 = phi i32 [ 6, %entry ], [ %phi3.inc, %loop1 ]
+; CHECK-NEXT:    --> {6,+,3}<nuw><nsw><%loop1> U: [6,499) S: [6,499) Exits: 498 LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi1.inc = add i32 %phi1, 1
+; CHECK-NEXT:    --> {11,+,1}<nuw><nsw><%loop1> U: [11,176) S: [11,176) Exits: 175 LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi2.inc = add i32 %phi2, 2
+; CHECK-NEXT:    --> {6,+,2}<nuw><nsw><%loop1> U: [6,335) S: [6,335) Exits: 334 LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi3.inc = add i32 %phi3, 3
+; CHECK-NEXT:    --> {9,+,3}<nuw><nsw><%loop1> U: [9,502) S: [9,502) Exits: 501 LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %sum1 = add i32 %phi1, %phi2
+; CHECK-NEXT:    --> {14,+,3}<%loop1> U: [14,507) S: [14,507) Exits: 506 LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %sum2 = add i32 %sum1, %phi3
+; CHECK-NEXT:    --> {20,+,6}<%loop1> U: [20,1005) S: [20,1005) Exits: 1004 LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi4 = phi i32 [ 63, %loop1 ], [ %phi4.inc, %loop2 ]
+; CHECK-NEXT:    --> {63,+,1}<nuw><nsw><%loop2> U: [63,205) S: [63,205) Exits: 204 LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %phi5 = phi i32 [ 53, %loop1 ], [ %phi5.inc, %loop2 ]
+; CHECK-NEXT:    --> {53,+,2}<nuw><nsw><%loop2> U: [53,336) S: [53,336) Exits: 335 LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %phi6 = phi i32 [ 43, %loop1 ], [ %phi6.inc, %loop2 ]
+; CHECK-NEXT:    --> {43,+,3}<nuw><nsw><%loop2> U: [43,467) S: [43,467) Exits: 466 LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %phi4.inc = add i32 %phi4, 1
+; CHECK-NEXT:    --> {64,+,1}<nuw><nsw><%loop2> U: [64,206) S: [64,206) Exits: 205 LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %phi5.inc = add i32 %phi5, 2
+; CHECK-NEXT:    --> {55,+,2}<nuw><nsw><%loop2> U: [55,338) S: [55,338) Exits: 337 LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %phi6.inc = add i32 %phi6, 3
+; CHECK-NEXT:    --> {46,+,3}<nuw><nsw><%loop2> U: [46,470) S: [46,470) Exits: 469 LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %sum3 = add i32 %phi4, %phi5
+; CHECK-NEXT:    --> {116,+,3}<%loop2> U: [116,540) S: [116,540) Exits: 539 LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %sum4 = add i32 %sum3, %phi6
+; CHECK-NEXT:    --> {159,+,6}<%loop2> U: [159,1006) S: [159,1006) Exits: 1005 LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %s1 = add i32 %phi1, %phi4
+; CHECK-NEXT:    --> {{\{\{}}73,+,1}<nuw><nsw><%loop1>,+,1}<nw><%loop2> U: [73,379) S: [73,379) --> 378 U: [378,379) S: [378,379)
+; CHECK-NEXT:    %s2 = add i32 %phi5, %phi2
+; CHECK-NEXT:    --> {{\{\{}}57,+,2}<nuw><nsw><%loop1>,+,2}<nw><%loop2> U: [57,668) S: [57,668) --> 667 U: [667,668) S: [667,668)
+; CHECK-NEXT:    %s3 = add i32 %sum1, %sum3
+; CHECK-NEXT:    --> {{\{\{}}130,+,3}<%loop1>,+,3}<%loop2> U: [130,1046) S: [130,1046) --> 1045 U: [1045,1046) S: [1045,1046)
+; CHECK-NEXT:    %s4 = add i32 %sum4, %sum2
+; CHECK-NEXT:    --> {{\{\{}}179,+,6}<%loop1>,+,6}<%loop2> U: [179,2010) S: [179,2010) --> 2009 U: [2009,2010) S: [2009,2010)
+; CHECK-NEXT:    %s5 = add i32 %phi3, %sum3
+; CHECK-NEXT:    --> {{\{\{}}122,+,3}<nuw><nsw><%loop1>,+,3}<%loop2> U: [122,1038) S: [122,1038) --> 1037 U: [1037,1038) S: [1037,1038)
+; CHECK-NEXT:    %s6 = add i32 %sum2, %phi6
+; CHECK-NEXT:    --> {{\{\{}}63,+,6}<%loop1>,+,3}<nw><%loop2> U: [63,1471) S: [63,1471) --> 1470 U: [1470,1471) S: [1470,1471)
+; CHECK-NEXT:  Determining loop execution counts for: @test_00
+; CHECK-NEXT:  Loop %loop2: backedge-taken count is i32 141
+; CHECK-NEXT:  Loop %loop2: constant max backedge-taken count is i32 141
+; CHECK-NEXT:  Loop %loop2: symbolic max backedge-taken count is i32 141
+; CHECK-NEXT:  Loop %loop2: Trip multiple is 142
+; CHECK-NEXT:  Loop %loop1: backedge-taken count is i32 164
+; CHECK-NEXT:  Loop %loop1: constant max backedge-taken count is i32 164
+; CHECK-NEXT:  Loop %loop1: symbolic max backedge-taken count is i32 164
+; CHECK-NEXT:  Loop %loop1: Trip multiple is 165
+;
 
 entry:
   br label %loop1
@@ -71,34 +106,68 @@ exit:
 ; in any order.
 
 define void @test_01(i32 %a, i32 %b) {
-
-; CHECK-LABEL: Classifying expressions for: @test_01
-; CHECK:       %sum1 = add i32 %phi1, %phi2
-; CHECK-NEXT:  -->  {(%a + %b),+,3}<%loop1>
-; CHECK:       %sum2 = add i32 %sum1, %phi3
-; CHECK-NEXT:  -->  {(6 + %a + %b),+,6}<%loop1>
-; CHECK:       %is1 = add i32 %sum2, %a
-; CHECK-NEXT:  -->  {(6 + (2 * %a) + %b),+,6}<%loop1>
-; CHECK:       %sum3 = add i32 %phi4, %phi5
-; CHECK-NEXT:  -->  {116,+,3}<%loop2>
-; CHECK:       %sum4 = add i32 %sum3, %phi6
-; CHECK-NEXT:  -->  {159,+,6}<%loop2>
-; CHECK:       %is2 = add i32 %sum4, %b
-; CHECK-NEXT:  -->  {(159 + %b),+,6}<%loop2>
-; CHECK:       %ec2 = add i32 %is1, %is2
-; CHECK-NEXT:  -->  {{{{}}(165 + (2 * %a) + (2 * %b)),+,6}<%loop1>,+,6}<%loop2>
-; CHECK:       %s1 = add i32 %phi1, %is1
-; CHECK-NEXT:  -->  {(6 + (3 * %a) + %b),+,7}<%loop1>
-; CHECK:       %s2 = add i32 %is2, %phi4
-; CHECK-NEXT:  -->  {(222 + %b),+,7}<%loop2>
-; CHECK:       %s3 = add i32 %is1, %phi5
-; CHECK-NEXT:  -->  {{{{}}(59 + (2 * %a) + %b),+,6}<%loop1>,+,2}<nw><%loop2>
-; CHECK:       %s4 = add i32 %phi2, %is2
-; CHECK-NEXT:  -->  {{{{}}(159 + (2 * %b)),+,2}<nw><%loop1>,+,6}<%loop2>
-; CHECK:       %s5 = add i32 %is1, %is2
-; CHECK-NEXT:  -->  {{{{}}(165 + (2 * %a) + (2 * %b)),+,6}<%loop1>,+,6}<%loop2>
-; CHECK:       %s6 = add i32 %is2, %is1
-; CHECK-NEXT:  -->  {{{{}}(165 + (2 * %a) + (2 * %b)),+,6}<%loop1>,+,6}<%loop2>
+; CHECK-LABEL: 'test_01'
+; CHECK-NEXT:  Classifying expressions for: @test_01
+; CHECK-NEXT:    %phi1 = phi i32 [ %a, %entry ], [ %phi1.inc, %loop1 ]
+; CHECK-NEXT:    --> {%a,+,1}<nw><%loop1> U: full-set S: full-set Exits: (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))) + %a) LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi2 = phi i32 [ %b, %entry ], [ %phi2.inc, %loop1 ]
+; CHECK-NEXT:    --> {%b,+,2}<nw><%loop1> U: full-set S: full-set Exits: ((2 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))<nuw><nsw> + %b) LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi3 = phi i32 [ 6, %entry ], [ %phi3.inc, %loop1 ]
+; CHECK-NEXT:    --> {6,+,3}<nuw><nsw><%loop1> U: [6,508) S: [6,508) Exits: (6 + (3 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))<nuw>)<nuw> LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi1.inc = add i32 %phi1, 1
+; CHECK-NEXT:    --> {(1 + %a),+,1}<nw><%loop1> U: full-set S: full-set Exits: (1 + ((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))) + %a) LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi2.inc = add i32 %phi2, 2
+; CHECK-NEXT:    --> {(2 + %b),+,2}<nw><%loop1> U: full-set S: full-set Exits: (2 + (2 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))<nuw><nsw> + %b) LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi3.inc = add i32 %phi3, 3
+; CHECK-NEXT:    --> {9,+,3}<nuw><nsw><%loop1> U: [9,511) S: [9,511) Exits: (9 + (3 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))<nuw>)<nuw> LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %sum1 = add i32 %phi1, %phi2
+; CHECK-NEXT:    --> {(%a + %b),+,3}<%loop1> U: full-set S: full-set Exits: ((3 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))<nuw> + %a + %b) LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %sum2 = add i32 %sum1, %phi3
+; CHECK-NEXT:    --> {(6 + %a + %b),+,6}<%loop1> U: full-set S: full-set Exits: (6 + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + %a + %b) LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %is1 = add i32 %sum2, %a
+; CHECK-NEXT:    --> {(6 + (2 * %a) + %b),+,6}<%loop1> U: full-set S: full-set Exits: (6 + (2 * %a) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + %b) LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi4 = phi i32 [ 63, %loop1 ], [ %phi4.inc, %loop2 ]
+; CHECK-NEXT:    --> {63,+,1}<nuw><nsw><%loop2> U: [63,231) S: [63,231) Exits: (63 + ((-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (-1 * (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))<nuw><nsw> + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))) /u 6) + (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))))) LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %phi5 = phi i32 [ 53, %loop1 ], [ %phi5.inc, %loop2 ]
+; CHECK-NEXT:    --> {53,+,2}<nuw><nsw><%loop2> U: [53,388) S: [53,388) Exits: (53 + (2 * (((-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (-1 * (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))<nuw><nsw> + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))) /u 6) + (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))))))<nuw><nsw>)<nuw><nsw> LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %phi6 = phi i32 [ 43, %loop1 ], [ %phi6.inc, %loop2 ]
+; CHECK-NEXT:    --> {43,+,3}<nuw><nsw><%loop2> U: [43,545) S: [43,545) Exits: (43 + (3 * (((-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (-1 * (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))<nuw><nsw> + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))) /u 6) + (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))))))<nuw>)<nuw> LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %phi4.inc = add i32 %phi4, 1
+; CHECK-NEXT:    --> {64,+,1}<nuw><nsw><%loop2> U: [64,232) S: [64,232) Exits: (64 + ((-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (-1 * (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))<nuw><nsw> + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))) /u 6) + (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))))) LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %phi5.inc = add i32 %phi5, 2
+; CHECK-NEXT:    --> {55,+,2}<nuw><nsw><%loop2> U: [55,390) S: [55,390) Exits: (55 + (2 * (((-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (-1 * (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))<nuw><nsw> + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))) /u 6) + (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))))))<nuw><nsw>)<nuw><nsw> LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %phi6.inc = add i32 %phi6, 3
+; CHECK-NEXT:    --> {46,+,3}<nuw><nsw><%loop2> U: [46,548) S: [46,548) Exits: (46 + (3 * (((-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (-1 * (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))<nuw><nsw> + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))) /u 6) + (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))))))<nuw>)<nuw> LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %sum3 = add i32 %phi4, %phi5
+; CHECK-NEXT:    --> {116,+,3}<%loop2> U: [116,618) S: [116,618) Exits: (116 + (3 * (((-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (-1 * (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))<nuw><nsw> + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))) /u 6) + (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))))))<nuw>)<nuw> LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %sum4 = add i32 %sum3, %phi6
+; CHECK-NEXT:    --> {159,+,6}<%loop2> U: [159,1162) S: [159,1162) Exits: (159 + (6 * (((-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (-1 * (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))<nuw><nsw> + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))) /u 6) + (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))))))) LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %is2 = add i32 %sum4, %b
+; CHECK-NEXT:    --> {(159 + %b),+,6}<%loop2> U: full-set S: full-set Exits: (159 + (6 * (((-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (-1 * (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))<nuw><nsw> + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))) /u 6) + (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))) + %b) LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %ec2 = add i32 %is1, %is2
+; CHECK-NEXT:    --> {{\{\{}}(165 + (2 * %a) + (2 * %b)),+,6}<%loop1>,+,6}<%loop2> U: full-set S: full-set --> {(165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))),+,6}<%loop2> U: full-set S: full-set Exits: (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (6 * (((-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (-1 * (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))<nuw><nsw> + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))) /u 6) + (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + 
%b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))))))) LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %s1 = add i32 %phi1, %is1
+; CHECK-NEXT:    --> {(6 + (3 * %a) + %b),+,7}<%loop1> U: full-set S: full-set --> (6 + (3 * %a) + (7 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + %b) U: full-set S: full-set
+; CHECK-NEXT:    %s2 = add i32 %is2, %phi4
+; CHECK-NEXT:    --> {(222 + %b),+,7}<%loop2> U: full-set S: full-set --> (222 + (7 * (((-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (-1 * (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))<nuw><nsw> + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))) /u 6) + (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))) + %b) U: full-set S: full-set
+; CHECK-NEXT:    %s3 = add i32 %is1, %phi5
+; CHECK-NEXT:    --> {{\{\{}}(59 + (2 * %a) + %b),+,6}<%loop1>,+,2}<nw><%loop2> U: full-set S: full-set --> (59 + (2 * (((-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (-1 * (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))<nuw><nsw> + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))) /u 6) + (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))))))<nuw><nsw> + (2 * %a) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + 
(-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + %b) U: full-set S: full-set
+; CHECK-NEXT:    %s4 = add i32 %phi2, %is2
+; CHECK-NEXT:    --> {{\{\{}}(159 + (2 * %b)),+,2}<nw><%loop1>,+,6}<%loop2> U: full-set S: full-set --> (159 + (2 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))<nuw><nsw> + (2 * %b) + (6 * (((-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (-1 * (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))<nuw><nsw> + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))) /u 6) + (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * 
%b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))))))) U: full-set S: full-set
+; CHECK-NEXT:    %s5 = add i32 %is1, %is2
+; CHECK-NEXT:    --> {{\{\{}}(165 + (2 * %a) + (2 * %b)),+,6}<%loop1>,+,6}<%loop2> U: full-set S: full-set --> (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (6 * (((-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (-1 * (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))<nuw><nsw> + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))) /u 6) + (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + 
(-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))))))) U: full-set S: full-set
+; CHECK-NEXT:    %s6 = add i32 %is2, %is1
+; CHECK-NEXT:    --> {{\{\{}}(165 + (2 * %a) + (2 * %b)),+,6}<%loop1>,+,6}<%loop2> U: full-set S: full-set --> (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (6 * (((-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (-1 * (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))<nuw><nsw> + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))) /u 6) + (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + 
(-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))))))) U: full-set S: full-set
+; CHECK-NEXT:  Determining loop execution counts for: @test_01
+; CHECK-NEXT:  Loop %loop2: backedge-taken count is (((-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (-1 * (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))<nuw><nsw> + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))) /u 6) + (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))
+; CHECK-NEXT:  Loop %loop2: constant max backedge-taken count is i32 167
+; CHECK-NEXT:  Loop %loop2: symbolic max backedge-taken count is (((-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (-1 * (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))<nuw><nsw> + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))))))))) /u 6) + (1 umin (-165 + (-6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))) + (-2 * %a) + (-2 * %b) + (1000 umax (165 + (2 * %a) + (2 * %b) + (6 * (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))))))))
+; CHECK-NEXT:  Loop %loop2: Trip multiple is 1
+; CHECK-NEXT:  Loop %loop1: backedge-taken count is (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))
+; CHECK-NEXT:  Loop %loop1: constant max backedge-taken count is i32 167
+; CHECK-NEXT:  Loop %loop1: symbolic max backedge-taken count is (((-6 + (-2 * %a) + (-1 * (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))<nuw><nsw> + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b))) /u 6) + (1 umin (-6 + (-2 * %a) + (-1 * %b) + (1000 umax (6 + (2 * %a) + %b)))))
+; CHECK-NEXT:  Loop %loop1: Trip multiple is 1
+;
 
 entry:
   br label %loop1
@@ -144,36 +213,72 @@ exit:
 ; loops in any order.
 
 define void @test_02(i32 %a, i32 %b, ptr %p) {
-
-; CHECK-LABEL: Classifying expressions for: @test_02
-; CHECK:       %sum1 = add i32 %phi1, %phi2
-; CHECK-NEXT:  -->  {(%a + %b),+,3}<%loop1>
-; CHECK:       %sum2 = add i32 %sum1, %phi3
-; CHECK-NEXT:  -->  {(6 + %a + %b),+,6}<%loop1>
-; CHECK:       %is1 = add i32 %sum2, %v1
-; CHECK-NEXT:  -->  ({(6 + %a + %b),+,6}<%loop1> + %v1)
-; CHECK:       %sum3 = add i32 %phi4, %phi5
-; CHECK-NEXT:  -->  {(%a + %b),+,3}<%loop2>
-; CHECK:       %sum4 = add i32 %sum3, %phi6
-; CHECK-NEXT:  -->  {(43 + %a + %b),+,6}<%loop2>
-; CHECK:       %is2 = add i32 %sum4, %v2
-; CHECK-NEXT:  -->  ({(43 + %a + %b),+,6}<%loop2> + %v2)
-; CHECK:       %is3 = add i32 %v1, %sum2
-; CHECK-NEXT:  -->  ({(6 + %a + %b),+,6}<%loop1> + %v1)
-; CHECK:       %ec2 = add i32 %is1, %is3
-; CHECK-NEXT:  -->  (2 * ({(6 + %a + %b),+,6}<%loop1> + %v1))
-; CHECK:       %s1 = add i32 %phi1, %is1
-; CHECK-NEXT:  -->  ({(6 + (2 * %a) + %b),+,7}<%loop1> + %v1)
-; CHECK:       %s2 = add i32 %is2, %phi4
-; CHECK-NEXT:  -->  ({(43 + (2 * %a) + %b),+,7}<%loop2> + %v2)
-; CHECK:       %s3 = add i32 %is1, %phi5
-; CHECK-NEXT:  -->  {({(6 + (2 * %b) + %a),+,6}<%loop1> + %v1),+,2}<%loop2>
-; CHECK:       %s4 = add i32 %phi2, %is2
-; CHECK-NEXT:  -->  ({{{{}}(43 + (2 * %b) + %a),+,2}<%loop1>,+,6}<%loop2> + %v2)
-; CHECK:       %s5 = add i32 %is1, %is2
-; CHECK-NEXT:  -->  ({({(49 + (2 * %a) + (2 * %b)),+,6}<%loop1> + %v1),+,6}<%loop2> + %v2)
-; CHECK:       %s6 = add i32 %is2, %is1
-; CHECK-NEXT:  -->  ({({(49 + (2 * %a) + (2 * %b)),+,6}<%loop1> + %v1),+,6}<%loop2> + %v2)
+; CHECK-LABEL: 'test_02'
+; CHECK-NEXT:  Classifying expressions for: @test_02
+; CHECK-NEXT:    %phi1 = phi i32 [ %a, %entry ], [ %phi1.inc, %loop1 ]
+; CHECK-NEXT:    --> {%a,+,1}<%loop1> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi2 = phi i32 [ %b, %entry ], [ %phi2.inc, %loop1 ]
+; CHECK-NEXT:    --> {%b,+,2}<%loop1> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi3 = phi i32 [ 6, %entry ], [ %phi3.inc, %loop1 ]
+; CHECK-NEXT:    --> {6,+,3}<%loop1> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi1.inc = add i32 %phi1, 1
+; CHECK-NEXT:    --> {(1 + %a),+,1}<%loop1> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi2.inc = add i32 %phi2, 2
+; CHECK-NEXT:    --> {(2 + %b),+,2}<%loop1> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi3.inc = add i32 %phi3, 3
+; CHECK-NEXT:    --> {9,+,3}<%loop1> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %v1 = load i32, ptr %p, align 4
+; CHECK-NEXT:    --> %v1 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop1: Variant }
+; CHECK-NEXT:    %sum1 = add i32 %phi1, %phi2
+; CHECK-NEXT:    --> {(%a + %b),+,3}<%loop1> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %sum2 = add i32 %sum1, %phi3
+; CHECK-NEXT:    --> {(6 + %a + %b),+,6}<%loop1> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %is1 = add i32 %sum2, %v1
+; CHECK-NEXT:    --> ({(6 + %a + %b),+,6}<%loop1> + %v1) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop1: Variant }
+; CHECK-NEXT:    %phi4 = phi i32 [ %a, %loop1 ], [ %phi4.inc, %loop2 ]
+; CHECK-NEXT:    --> {%a,+,1}<%loop2> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %phi5 = phi i32 [ %b, %loop1 ], [ %phi5.inc, %loop2 ]
+; CHECK-NEXT:    --> {%b,+,2}<%loop2> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %phi6 = phi i32 [ 43, %loop1 ], [ %phi6.inc, %loop2 ]
+; CHECK-NEXT:    --> {43,+,3}<%loop2> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %phi4.inc = add i32 %phi4, 1
+; CHECK-NEXT:    --> {(1 + %a),+,1}<%loop2> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %phi5.inc = add i32 %phi5, 2
+; CHECK-NEXT:    --> {(2 + %b),+,2}<%loop2> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %phi6.inc = add i32 %phi6, 3
+; CHECK-NEXT:    --> {46,+,3}<%loop2> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %v2 = load i32, ptr %p, align 4
+; CHECK-NEXT:    --> %v2 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Variant }
+; CHECK-NEXT:    %sum3 = add i32 %phi4, %phi5
+; CHECK-NEXT:    --> {(%a + %b),+,3}<%loop2> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %sum4 = add i32 %sum3, %phi6
+; CHECK-NEXT:    --> {(43 + %a + %b),+,6}<%loop2> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %is2 = add i32 %sum4, %v2
+; CHECK-NEXT:    --> ({(43 + %a + %b),+,6}<%loop2> + %v2) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Variant }
+; CHECK-NEXT:    %is3 = add i32 %v1, %sum2
+; CHECK-NEXT:    --> ({(6 + %a + %b),+,6}<%loop1> + %v1) U: full-set S: full-set Exits: ({(6 + %a + %b),+,6}<%loop1> + %v1) LoopDispositions: { %loop2: Invariant }
+; CHECK-NEXT:    %ec2 = add i32 %is1, %is3
+; CHECK-NEXT:    --> (2 * ({(6 + %a + %b),+,6}<%loop1> + %v1)) U: [0,-1) S: [-2147483648,2147483647) Exits: (2 * ({(6 + %a + %b),+,6}<%loop1> + %v1)) LoopDispositions: { %loop2: Invariant }
+; CHECK-NEXT:    %s1 = add i32 %phi1, %is1
+; CHECK-NEXT:    --> ({(6 + (2 * %a) + %b),+,7}<%loop1> + %v1) U: full-set S: full-set
+; CHECK-NEXT:    %s2 = add i32 %is2, %phi4
+; CHECK-NEXT:    --> ({(43 + (2 * %a) + %b),+,7}<%loop2> + %v2) U: full-set S: full-set
+; CHECK-NEXT:    %s3 = add i32 %is1, %phi5
+; CHECK-NEXT:    --> {({(6 + (2 * %b) + %a),+,6}<%loop1> + %v1),+,2}<%loop2> U: full-set S: full-set
+; CHECK-NEXT:    %s4 = add i32 %phi2, %is2
+; CHECK-NEXT:    --> ({{\{\{}}(43 + (2 * %b) + %a),+,2}<%loop1>,+,6}<%loop2> + %v2) U: full-set S: full-set
+; CHECK-NEXT:    %s5 = add i32 %is1, %is2
+; CHECK-NEXT:    --> ({({(49 + (2 * %a) + (2 * %b)),+,6}<%loop1> + %v1),+,6}<%loop2> + %v2) U: full-set S: full-set
+; CHECK-NEXT:    %s6 = add i32 %is2, %is1
+; CHECK-NEXT:    --> ({({(49 + (2 * %a) + (2 * %b)),+,6}<%loop1> + %v1),+,6}<%loop2> + %v2) U: full-set S: full-set
+; CHECK-NEXT:  Determining loop execution counts for: @test_02
+; CHECK-NEXT:  Loop %loop2: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %loop2: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %loop2: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT:  Loop %loop1: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %loop1: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %loop1: Unpredictable symbolic max backedge-taken count.
+;
 
 entry:
   br label %loop1
@@ -224,16 +329,33 @@ exit:
 ; because we cannot prove for sure that it doesn't use Phis of loop 2.
 
 define void @test_03(i32 %a, i32 %b, i32 %c, ptr %p) {
-
-; CHECK-LABEL: Classifying expressions for: @test_03
-; CHECK:       %v1 = load i32, ptr %p
-; CHECK-NEXT:  -->  %v1
-; CHECK:       %s1 = add i32 %phi1, %v1
-; CHECK-NEXT:  -->  ({%a,+,1}<%loop1> + %v1)
-; CHECK:       %s2 = add i32 %s1, %b
-; CHECK-NEXT:  -->  ({(%a + %b),+,1}<%loop1> + %v1)
-; CHECK:       %s3 = add i32 %s2, %phi2
-; CHECK-NEXT:  -->  ({{{{}}((2 * %a) + %b),+,1}<%loop1>,+,2}<%loop2> + %v1)
+; CHECK-LABEL: 'test_03'
+; CHECK-NEXT:  Classifying expressions for: @test_03
+; CHECK-NEXT:    %phi1 = phi i32 [ %a, %entry ], [ %phi1.inc, %loop1 ]
+; CHECK-NEXT:    --> {%a,+,1}<%loop1> U: full-set S: full-set Exits: (%a umax %c) LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi1.inc = add i32 %phi1, 1
+; CHECK-NEXT:    --> {(1 + %a),+,1}<%loop1> U: full-set S: full-set Exits: (1 + (%a umax %c)) LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %phi2 = phi i32 [ %a, %loop1 ], [ %phi2.inc, %loop2 ]
+; CHECK-NEXT:    --> {%a,+,2}<%loop2> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %phi2.inc = add i32 %phi2, 2
+; CHECK-NEXT:    --> {(2 + %a),+,2}<%loop2> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %v1 = load i32, ptr %p, align 4
+; CHECK-NEXT:    --> %v1 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Variant }
+; CHECK-NEXT:    %s1 = add i32 %phi1, %v1
+; CHECK-NEXT:    --> ({%a,+,1}<%loop1> + %v1) U: full-set S: full-set --> ((%a umax %c) + %v1) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Variant }
+; CHECK-NEXT:    %s2 = add i32 %s1, %b
+; CHECK-NEXT:    --> ({(%a + %b),+,1}<%loop1> + %v1) U: full-set S: full-set --> ((%a umax %c) + %b + %v1) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Variant }
+; CHECK-NEXT:    %s3 = add i32 %s2, %phi2
+; CHECK-NEXT:    --> ({{\{\{}}((2 * %a) + %b),+,1}<%loop1>,+,2}<%loop2> + %v1) U: full-set S: full-set --> ({((%a umax %c) + %a + %b),+,2}<%loop2> + %v1) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Variant }
+; CHECK-NEXT:  Determining loop execution counts for: @test_03
+; CHECK-NEXT:  Loop %loop2: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %loop2: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %loop2: Unpredictable symbolic max backedge-taken count.
+; CHECK-NEXT:  Loop %loop1: backedge-taken count is ((-1 * %a) + (%a umax %c))
+; CHECK-NEXT:  Loop %loop1: constant max backedge-taken count is i32 -1
+; CHECK-NEXT:  Loop %loop1: symbolic max backedge-taken count is ((-1 * %a) + (%a umax %c))
+; CHECK-NEXT:  Loop %loop1: Trip multiple is 1
+;
 
 entry:
   br label %loop1
@@ -262,29 +384,40 @@ exit:
 ; Another mix of previous use cases that demonstrates that incorrect picking of
 ; a loop for a recurrence may cause a crash of SCEV analysis.
 define void @test_04() {
-
-; CHECK-LABEL: Classifying expressions for: @test_04
-; CHECK:       %tmp = phi i64 [ 2, %bb ], [ %tmp4, %bb3 ]
-; CHECK-NEXT:  -->  {2,+,1}<nuw><nsw><%loop1>
-; CHECK:       %tmp2 = trunc i64 %tmp to i32
-; CHECK-NEXT:  -->  {2,+,1}<%loop1>
-; CHECK:       %tmp4 = add nuw nsw i64 %tmp, 1
-; CHECK-NEXT:  -->  {3,+,1}<nuw><%loop1>
-; CHECK:       %tmp7 = phi i64 [ %tmp15, %loop2 ], [ 2, %loop1 ]
-; CHECK-NEXT:  -->  {2,+,1}<nuw><nsw><%loop2> U: [2,9223372036854775807) S: [2,9223372036854775807)
-; CHECK:       %tmp9 = sext i8 %tmp8 to i64
-; CHECK-NEXT:  -->  (sext i8 %tmp8 to i64) U: [-128,128) S: [-128,128)
-; CHECK:       %tmp10 = sub i64 %tmp9, %tmp7
-; CHECK-NEXT:  -->  ((sext i8 %tmp8 to i64) + {-2,+,-1}<nsw><%loop2>) U: [9223372036854775682,126) S: [9223372036854775682,126)
-; CHECK:       %tmp11 = add i64 %tmp10, undef
-; CHECK-NEXT:  -->  ((sext i8 %tmp8 to i64) + {(-2 + undef),+,-1}<nw><%loop2>)
-; CHECK:       %tmp13 = trunc i64 %tmp11 to i32
-; CHECK-NEXT:  -->  ((sext i8 %tmp8 to i32) + {(-2 + (trunc i64 undef to i32)),+,-1}<%loop2>)
-; CHECK:       %tmp14 = sub i32 %tmp13, %tmp2
+; CHECK-LABEL: 'test_04'
+; CHECK-NEXT:  Classifying expressions for: @test_04
+; CHECK-NEXT:    %tmp = phi i64 [ 2, %bb ], [ %tmp4, %bb3 ]
+; CHECK-NEXT:    --> {2,+,1}<nuw><nsw><%loop1> U: [2,-9223372036854775808) S: [2,-9223372036854775808) Exits: <<Unknown>> LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %tmp2 = trunc i64 %tmp to i32
+; CHECK-NEXT:    --> {2,+,1}<%loop1> U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %tmp4 = add nuw nsw i64 %tmp, 1
+; CHECK-NEXT:    --> {3,+,1}<nuw><%loop1> U: [3,0) S: [3,0) Exits: <<Unknown>> LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %tmp7 = phi i64 [ %tmp15, %loop2 ], [ 2, %loop1 ]
+; CHECK-NEXT:    --> {2,+,1}<nuw><nsw><%loop2> U: [2,9223372036854775807) S: [2,9223372036854775807) Exits: (-1 + (3 smax {2,+,1}<nuw><nsw><%loop1>))<nsw> LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %tmp8 = load i8, ptr addrspace(1) undef, align 1
+; CHECK-NEXT:    --> %tmp8 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Variant }
+; CHECK-NEXT:    %tmp9 = sext i8 %tmp8 to i64
+; CHECK-NEXT:    --> (sext i8 %tmp8 to i64) U: [-128,128) S: [-128,128) Exits: <<Unknown>> LoopDispositions: { %loop2: Variant }
+; CHECK-NEXT:    %tmp10 = sub i64 %tmp9, %tmp7
+; CHECK-NEXT:    --> ((sext i8 %tmp8 to i64) + {-2,+,-1}<nsw><%loop2>) U: [9223372036854775682,126) S: [9223372036854775682,126) Exits: <<Unknown>> LoopDispositions: { %loop2: Variant }
+; CHECK-NEXT:    %tmp11 = add i64 %tmp10, undef
+; CHECK-NEXT:    --> ((sext i8 %tmp8 to i64) + {(-2 + undef),+,-1}<nw><%loop2>) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Variant }
+; CHECK-NEXT:    %tmp13 = trunc i64 %tmp11 to i32
+; CHECK-NEXT:    --> ((sext i8 %tmp8 to i32) + {(-2 + (trunc i64 undef to i32)),+,-1}<%loop2>) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Variant }
+; CHECK-NEXT:    %tmp14 = sub i32 %tmp13, %tmp2
+; CHECK-NEXT:    --> ((sext i8 %tmp8 to i32) + {{\{\{}}(-4 + (trunc i64 undef to i32)),+,-1}<%loop1>,+,-1}<%loop2>) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %loop2: Variant }
+; CHECK-NEXT:    %tmp15 = add nuw nsw i64 %tmp7, 1
+; CHECK-NEXT:    --> {3,+,1}<nuw><nsw><%loop2> U: [3,-9223372036854775808) S: [3,-9223372036854775808) Exits: (3 smax {2,+,1}<nuw><nsw><%loop1>) LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:  Determining loop execution counts for: @test_04
+; CHECK-NEXT:  Loop %loop2: backedge-taken count is (-3 + (3 smax {2,+,1}<nuw><nsw><%loop1>))<nsw>
+; CHECK-NEXT:  Loop %loop2: constant max backedge-taken count is i64 9223372036854775804
+; CHECK-NEXT:  Loop %loop2: symbolic max backedge-taken count is (-3 + (3 smax {2,+,1}<nuw><nsw><%loop1>))<nsw>
+; CHECK-NEXT:  Loop %loop2: Trip multiple is 1
+; CHECK-NEXT:  Loop %loop1: Unpredictable backedge-taken count.
+; CHECK-NEXT:  Loop %loop1: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT:  Loop %loop1: Unpredictable symbolic max backedge-taken count.
+;
 ; `{{[{][{]}}` is the ugliness needed to match `{{`
-; CHECK-NEXT:  -->  ((sext i8 %tmp8 to i32) + {{[{][{]}}(-4 + (trunc i64 undef to i32)),+,-1}<%loop1>,+,-1}<%loop2>)
-; CHECK:       %tmp15 = add nuw nsw i64 %tmp7, 1
-; CHECK-NEXT:  -->  {3,+,1}<nuw><nsw><%loop2>
 
 bb:
   br label %loop1
@@ -319,14 +452,28 @@ loop2:
 ; Demonstrate a situation when we can add two recs with different degrees from
 ; the same loop.
 define void @test_05(i32 %N) {
-
-; CHECK-LABEL: Classifying expressions for: @test_05
-; CHECK:       %SQ = mul i32 %i.0, %i.0
-; CHECK-NEXT:  -->  {4,+,5,+,2}<%bb3>
-; CHECK:       %tmp4 = mul i32 %i.0, 2
-; CHECK-NEXT:  -->  {4,+,2}<nuw><nsw><%bb3>
-; CHECK:       %tmp5 = sub i32 %SQ, %tmp4
-; CHECK-NEXT:  -->  {0,+,3,+,2}<%bb3>
+; CHECK-LABEL: 'test_05'
+; CHECK-NEXT:  Classifying expressions for: @test_05
+; CHECK-NEXT:    %"alloca point" = bitcast i32 0 to i32
+; CHECK-NEXT:    --> 0 U: [0,1) S: [0,1)
+; CHECK-NEXT:    %tmp = getelementptr [1000 x i32], ptr @A, i32 0, i32 %i.0
+; CHECK-NEXT:    --> {(8 + @A)<nuw><nsw>,+,4}<nw><%bb3> U: [0,-3) S: [-9223372036854775808,9223372036854775805) Exits: (408 + @A) LoopDispositions: { %bb3: Computable }
+; CHECK-NEXT:    %tmp2 = add i32 %i.0, 1
+; CHECK-NEXT:    --> {3,+,1}<nuw><nsw><%bb3> U: [3,104) S: [3,104) Exits: 103 LoopDispositions: { %bb3: Computable }
+; CHECK-NEXT:    %i.0 = phi i32 [ 2, %entry ], [ %tmp2, %bb ]
+; CHECK-NEXT:    --> {2,+,1}<nuw><nsw><%bb3> U: [2,103) S: [2,103) Exits: 102 LoopDispositions: { %bb3: Computable }
+; CHECK-NEXT:    %SQ = mul i32 %i.0, %i.0
+; CHECK-NEXT:    --> {4,+,5,+,2}<%bb3> U: full-set S: full-set Exits: 10404 LoopDispositions: { %bb3: Computable }
+; CHECK-NEXT:    %tmp4 = mul i32 %i.0, 2
+; CHECK-NEXT:    --> {4,+,2}<nuw><nsw><%bb3> U: [4,205) S: [4,205) Exits: 204 LoopDispositions: { %bb3: Computable }
+; CHECK-NEXT:    %tmp5 = sub i32 %SQ, %tmp4
+; CHECK-NEXT:    --> {0,+,3,+,2}<%bb3> U: full-set S: full-set Exits: 10200 LoopDispositions: { %bb3: Computable }
+; CHECK-NEXT:  Determining loop execution counts for: @test_05
+; CHECK-NEXT:  Loop %bb3: backedge-taken count is i32 100
+; CHECK-NEXT:  Loop %bb3: constant max backedge-taken count is i32 100
+; CHECK-NEXT:  Loop %bb3: symbolic max backedge-taken count is i32 100
+; CHECK-NEXT:  Loop %bb3: Trip multiple is 101
+;
 
 entry:
         %"alloca point" = bitcast i32 0 to i32           ; <i32> [#uses=0]
@@ -356,20 +503,46 @@ return:         ; preds = %bb5
 ; Check that we can add Phis from different loops with different nesting, nested
 ; loop comes first.
 define void @test_06() {
-
-; CHECK-LABEL: Classifying expressions for: @test_06
-; CHECK:       %s1 = add i32 %phi1, %phi2
-; CHECK-NEXT:  -->  {{\{\{}}30,+,1}<nuw><nsw><%loop1>,+,2}<nw><%loop2>
-; CHECK:       %s2 = add i32 %phi2, %phi1
-; CHECK-NEXT:  -->  {{\{\{}}30,+,1}<nuw><nsw><%loop1>,+,2}<nw><%loop2>
-; CHECK:       %s3 = add i32 %phi1, %phi3
-; CHECK-NEXT:  -->  {{\{\{}}40,+,1}<nuw><nsw><%loop1>,+,3}<nw><%loop3>
-; CHECK:       %s4 = add i32 %phi3, %phi1
-; CHECK-NEXT:  -->  {{\{\{}}40,+,1}<nuw><nsw><%loop1>,+,3}<nw><%loop3>
-; CHECK:       %s5 = add i32 %phi2, %phi3
-; CHECK-NEXT:  -->  {{\{\{}}50,+,2}<nuw><nsw><%loop2>,+,3}<nw><%loop3>
-; CHECK:       %s6 = add i32 %phi3, %phi2
-; CHECK-NEXT:  -->  {{\{\{}}50,+,2}<nuw><nsw><%loop2>,+,3}<nw><%loop3>
+; CHECK-LABEL: 'test_06'
+; CHECK-NEXT:  Classifying expressions for: @test_06
+; CHECK-NEXT:    %phi1 = phi i32 [ 10, %entry ], [ %phi1.inc, %loop1.exit ]
+; CHECK-NEXT:    --> {10,+,1}<nuw><nsw><%loop1> U: [10,1000) S: [10,1000) Exits: 999 LoopDispositions: { %loop1: Computable, %loop2: Invariant }
+; CHECK-NEXT:    %phi2 = phi i32 [ 20, %loop1 ], [ %phi2.inc, %loop2 ]
+; CHECK-NEXT:    --> {20,+,2}<nuw><nsw><%loop2> U: [20,999) S: [20,999) Exits: 998 LoopDispositions: { %loop2: Computable, %loop1: Variant }
+; CHECK-NEXT:    %phi2.inc = add i32 %phi2, 2
+; CHECK-NEXT:    --> {22,+,2}<nuw><nsw><%loop2> U: [22,1001) S: [22,1001) Exits: 1000 LoopDispositions: { %loop2: Computable, %loop1: Variant }
+; CHECK-NEXT:    %phi1.inc = add i32 %phi1, 1
+; CHECK-NEXT:    --> {11,+,1}<nuw><nsw><%loop1> U: [11,1001) S: [11,1001) Exits: 1000 LoopDispositions: { %loop1: Computable, %loop2: Invariant }
+; CHECK-NEXT:    %phi3 = phi i32 [ 30, %loop1.exit ], [ %phi3.inc, %loop3 ]
+; CHECK-NEXT:    --> {30,+,3}<nuw><nsw><%loop3> U: [30,1000) S: [30,1000) Exits: 999 LoopDispositions: { %loop3: Computable }
+; CHECK-NEXT:    %phi3.inc = add i32 %phi3, 3
+; CHECK-NEXT:    --> {33,+,3}<nuw><nsw><%loop3> U: [33,1003) S: [33,1003) Exits: 1002 LoopDispositions: { %loop3: Computable }
+; CHECK-NEXT:    %s1 = add i32 %phi1, %phi2
+; CHECK-NEXT:    --> {{\{\{}}30,+,1}<nuw><nsw><%loop1>,+,2}<nw><%loop2> U: [30,1998) S: [30,1998) --> 1997 U: [1997,1998) S: [1997,1998)
+; CHECK-NEXT:    %s2 = add i32 %phi2, %phi1
+; CHECK-NEXT:    --> {{\{\{}}30,+,1}<nuw><nsw><%loop1>,+,2}<nw><%loop2> U: [30,1998) S: [30,1998) --> 1997 U: [1997,1998) S: [1997,1998)
+; CHECK-NEXT:    %s3 = add i32 %phi1, %phi3
+; CHECK-NEXT:    --> {{\{\{}}40,+,1}<nuw><nsw><%loop1>,+,3}<nw><%loop3> U: [40,1999) S: [40,1999) --> 1998 U: [1998,1999) S: [1998,1999)
+; CHECK-NEXT:    %s4 = add i32 %phi3, %phi1
+; CHECK-NEXT:    --> {{\{\{}}40,+,1}<nuw><nsw><%loop1>,+,3}<nw><%loop3> U: [40,1999) S: [40,1999) --> 1998 U: [1998,1999) S: [1998,1999)
+; CHECK-NEXT:    %s5 = add i32 %phi2, %phi3
+; CHECK-NEXT:    --> {{\{\{}}50,+,2}<nuw><nsw><%loop2>,+,3}<nw><%loop3> U: [50,1998) S: [50,1998) --> 1997 U: [1997,1998) S: [1997,1998)
+; CHECK-NEXT:    %s6 = add i32 %phi3, %phi2
+; CHECK-NEXT:    --> {{\{\{}}50,+,2}<nuw><nsw><%loop2>,+,3}<nw><%loop3> U: [50,1998) S: [50,1998) --> 1997 U: [1997,1998) S: [1997,1998)
+; CHECK-NEXT:  Determining loop execution counts for: @test_06
+; CHECK-NEXT:  Loop %loop3: backedge-taken count is i32 323
+; CHECK-NEXT:  Loop %loop3: constant max backedge-taken count is i32 323
+; CHECK-NEXT:  Loop %loop3: symbolic max backedge-taken count is i32 323
+; CHECK-NEXT:  Loop %loop3: Trip multiple is 324
+; CHECK-NEXT:  Loop %loop2: backedge-taken count is i32 489
+; CHECK-NEXT:  Loop %loop2: constant max backedge-taken count is i32 489
+; CHECK-NEXT:  Loop %loop2: symbolic max backedge-taken count is i32 489
+; CHECK-NEXT:  Loop %loop2: Trip multiple is 490
+; CHECK-NEXT:  Loop %loop1: backedge-taken count is i32 989
+; CHECK-NEXT:  Loop %loop1: constant max backedge-taken count is i32 989
+; CHECK-NEXT:  Loop %loop1: symbolic max backedge-taken count is i32 989
+; CHECK-NEXT:  Loop %loop1: Trip multiple is 990
+;
 
 entry:
   br label %loop1
@@ -408,20 +581,46 @@ exit:
 ; Check that we can add Phis from different loops with different nesting, nested
 ; loop comes second.
 define void @test_07() {
-
-; CHECK-LABEL: Classifying expressions for: @test_07
-; CHECK:       %s1 = add i32 %phi1, %phi2
-; CHECK-NEXT:  -->  {{\{\{}}30,+,1}<nuw><nsw><%loop1>,+,2}<nw><%loop2>
-; CHECK:       %s2 = add i32 %phi2, %phi1
-; CHECK-NEXT:  -->  {{\{\{}}30,+,1}<nuw><nsw><%loop1>,+,2}<nw><%loop2>
-; CHECK:       %s3 = add i32 %phi1, %phi3
-; CHECK-NEXT:  -->  {{\{\{}}40,+,3}<nuw><nsw><%loop3>,+,1}<nw><%loop1>
-; CHECK:       %s4 = add i32 %phi3, %phi1
-; CHECK-NEXT:  -->  {{\{\{}}40,+,3}<nuw><nsw><%loop3>,+,1}<nw><%loop1>
-; CHECK:       %s5 = add i32 %phi2, %phi3
-; CHECK-NEXT:  -->  {{\{\{}}50,+,3}<nuw><nsw><%loop3>,+,2}<nw><%loop2>
-; CHECK:       %s6 = add i32 %phi3, %phi2
-; CHECK-NEXT:  -->  {{\{\{}}50,+,3}<nuw><nsw><%loop3>,+,2}<nw><%loop2>
+; CHECK-LABEL: 'test_07'
+; CHECK-NEXT:  Classifying expressions for: @test_07
+; CHECK-NEXT:    %phi3 = phi i32 [ 30, %entry ], [ %phi3.inc, %loop3 ]
+; CHECK-NEXT:    --> {30,+,3}<nuw><nsw><%loop3> U: [30,1000) S: [30,1000) Exits: 999 LoopDispositions: { %loop3: Computable }
+; CHECK-NEXT:    %phi3.inc = add i32 %phi3, 3
+; CHECK-NEXT:    --> {33,+,3}<nuw><nsw><%loop3> U: [33,1003) S: [33,1003) Exits: 1002 LoopDispositions: { %loop3: Computable }
+; CHECK-NEXT:    %phi1 = phi i32 [ 10, %loop3 ], [ %phi1.inc, %loop1.exit ]
+; CHECK-NEXT:    --> {10,+,1}<nuw><nsw><%loop1> U: [10,11) S: [10,11) Exits: 10 LoopDispositions: { %loop1: Computable, %loop2: Invariant }
+; CHECK-NEXT:    %phi2 = phi i32 [ 20, %loop1 ], [ %phi2.inc, %loop2 ]
+; CHECK-NEXT:    --> {20,+,2}<nuw><nsw><%loop2> U: [20,999) S: [20,999) Exits: 998 LoopDispositions: { %loop2: Computable, %loop1: Variant }
+; CHECK-NEXT:    %phi2.inc = add i32 %phi2, 2
+; CHECK-NEXT:    --> {22,+,2}<nuw><nsw><%loop2> U: [22,1001) S: [22,1001) Exits: 1000 LoopDispositions: { %loop2: Computable, %loop1: Variant }
+; CHECK-NEXT:    %phi1.inc = add i32 %phi1, 1
+; CHECK-NEXT:    --> {11,+,1}<nuw><nsw><%loop1> U: [11,12) S: [11,12) Exits: 11 LoopDispositions: { %loop1: Computable, %loop2: Invariant }
+; CHECK-NEXT:    %s1 = add i32 %phi1, %phi2
+; CHECK-NEXT:    --> {{\{\{}}30,+,1}<nuw><nsw><%loop1>,+,2}<nw><%loop2> U: [30,1009) S: [30,1009) --> 1008 U: [1008,1009) S: [1008,1009)
+; CHECK-NEXT:    %s2 = add i32 %phi2, %phi1
+; CHECK-NEXT:    --> {{\{\{}}30,+,1}<nuw><nsw><%loop1>,+,2}<nw><%loop2> U: [30,1009) S: [30,1009) --> 1008 U: [1008,1009) S: [1008,1009)
+; CHECK-NEXT:    %s3 = add i32 %phi1, %phi3
+; CHECK-NEXT:    --> {{\{\{}}40,+,3}<nuw><nsw><%loop3>,+,1}<nw><%loop1> U: [40,1010) S: [40,1010) --> 1009 U: [1009,1010) S: [1009,1010)
+; CHECK-NEXT:    %s4 = add i32 %phi3, %phi1
+; CHECK-NEXT:    --> {{\{\{}}40,+,3}<nuw><nsw><%loop3>,+,1}<nw><%loop1> U: [40,1010) S: [40,1010) --> 1009 U: [1009,1010) S: [1009,1010)
+; CHECK-NEXT:    %s5 = add i32 %phi2, %phi3
+; CHECK-NEXT:    --> {{\{\{}}50,+,3}<nuw><nsw><%loop3>,+,2}<nw><%loop2> U: [50,1998) S: [50,1998) --> 1997 U: [1997,1998) S: [1997,1998)
+; CHECK-NEXT:    %s6 = add i32 %phi3, %phi2
+; CHECK-NEXT:    --> {{\{\{}}50,+,3}<nuw><nsw><%loop3>,+,2}<nw><%loop2> U: [50,1998) S: [50,1998) --> 1997 U: [1997,1998) S: [1997,1998)
+; CHECK-NEXT:  Determining loop execution counts for: @test_07
+; CHECK-NEXT:  Loop %loop2: backedge-taken count is i32 489
+; CHECK-NEXT:  Loop %loop2: constant max backedge-taken count is i32 489
+; CHECK-NEXT:  Loop %loop2: symbolic max backedge-taken count is i32 489
+; CHECK-NEXT:  Loop %loop2: Trip multiple is 490
+; CHECK-NEXT:  Loop %loop1: backedge-taken count is i32 0
+; CHECK-NEXT:  Loop %loop1: constant max backedge-taken count is i32 0
+; CHECK-NEXT:  Loop %loop1: symbolic max backedge-taken count is i32 0
+; CHECK-NEXT:  Loop %loop1: Trip multiple is 1
+; CHECK-NEXT:  Loop %loop3: backedge-taken count is i32 323
+; CHECK-NEXT:  Loop %loop3: constant max backedge-taken count is i32 323
+; CHECK-NEXT:  Loop %loop3: symbolic max backedge-taken count is i32 323
+; CHECK-NEXT:  Loop %loop3: Trip multiple is 324
+;
 
 entry:
   br label %loop3
@@ -460,16 +659,50 @@ exit:
 ; Make sure that a complicated Phi does not get folded with rec's start value
 ; of a loop which is above.
 define void @test_08() {
-
-; CHECK-LABEL: Classifying expressions for: @test_08
-; CHECK:       %tmp11 = add i64 %iv.2.2, %iv.2.1
-; CHECK-NEXT:  -->  ({0,+,-1}<nuw><nsw><%loop_2> + %iv.2.1)
-; CHECK:       %tmp12 = trunc i64 %tmp11 to i32
-; CHECK-NEXT:  -->  ((trunc i64 %iv.2.1 to i32) + {0,+,-1}<%loop_2>)
-; CHECK:       %tmp14 = mul i32 %tmp12, %tmp7
-; CHECK-NEXT:  -->  (((trunc i64 %iv.2.1 to i32) + {0,+,-1}<%loop_2>) * {-1,+,-1}<%loop_1>)
-; CHECK:       %tmp16 = mul i64 %iv.2.1, %iv.1.1
-; CHECK-NEXT:  -->  ({2,+,1}<nuw><nsw><%loop_1> * %iv.2.1)
+; CHECK-LABEL: 'test_08'
+; CHECK-NEXT:  Classifying expressions for: @test_08
+; CHECK-NEXT:    %iv.1.1 = phi i64 [ 2, %entry ], [ %iv.1.1.next, %loop_1_back_branch ]
+; CHECK-NEXT:    --> {2,+,1}<nuw><nsw><%loop_1> U: [2,4) S: [2,4) Exits: 3 LoopDispositions: { %loop_1: Computable }
+; CHECK-NEXT:    %iv.1.2 = phi i32 [ -1, %entry ], [ %iv.1.2.next, %loop_1_back_branch ]
+; CHECK-NEXT:    --> {-1,+,1}<nsw><%loop_1> U: [-1,1) S: [-1,1) Exits: 0 LoopDispositions: { %loop_1: Computable }
+; CHECK-NEXT:    %iv.1.1.next = add nuw nsw i64 %iv.1.1, 1
+; CHECK-NEXT:    --> {3,+,1}<nuw><nsw><%loop_1> U: [3,5) S: [3,5) Exits: 4 LoopDispositions: { %loop_1: Computable }
+; CHECK-NEXT:    %iv.1.2.next = add nsw i32 %iv.1.2, 1
+; CHECK-NEXT:    --> {0,+,1}<nuw><nsw><%loop_1> U: [0,2) S: [0,2) Exits: 1 LoopDispositions: { %loop_1: Computable }
+; CHECK-NEXT:    %tmp6 = sub i64 1, %iv.1.1
+; CHECK-NEXT:    --> {-1,+,-1}<nsw><%loop_1> U: [-2,0) S: [-2,0) --> -2 U: [-2,-1) S: [-2,-1)
+; CHECK-NEXT:    %tmp7 = trunc i64 %tmp6 to i32
+; CHECK-NEXT:    --> {-1,+,-1}<%loop_1> U: [-2,0) S: [-2,0) --> -2 U: [-2,-1) S: [-2,-1)
+; CHECK-NEXT:    %iv.2.1 = phi i64 [ 0, %loop_2_preheader ], [ %tmp16, %loop_2 ]
+; CHECK-NEXT:    --> %iv.2.1 U: full-set S: full-set Exits: 0 LoopDispositions: { %loop_2: Variant }
+; CHECK-NEXT:    %iv.2.2 = phi i64 [ 0, %loop_2_preheader ], [ %iv.2.2.next, %loop_2 ]
+; CHECK-NEXT:    --> {0,+,-1}<nuw><nsw><%loop_2> U: [0,1) S: [0,1) Exits: 0 LoopDispositions: { %loop_2: Computable }
+; CHECK-NEXT:    %iv.2.3 = phi i64 [ 2, %loop_2_preheader ], [ %iv.2.3.next, %loop_2 ]
+; CHECK-NEXT:    --> {2,+,1}<nuw><nsw><%loop_2> U: [2,3) S: [2,3) Exits: 2 LoopDispositions: { %loop_2: Computable }
+; CHECK-NEXT:    %tmp11 = add i64 %iv.2.2, %iv.2.1
+; CHECK-NEXT:    --> ({0,+,-1}<nuw><nsw><%loop_2> + %iv.2.1) U: full-set S: full-set Exits: 0 LoopDispositions: { %loop_2: Variant }
+; CHECK-NEXT:    %tmp12 = trunc i64 %tmp11 to i32
+; CHECK-NEXT:    --> ((trunc i64 %iv.2.1 to i32) + {0,+,-1}<%loop_2>) U: full-set S: full-set Exits: 0 LoopDispositions: { %loop_2: Variant }
+; CHECK-NEXT:    %tmp14 = mul i32 %tmp12, %tmp7
+; CHECK-NEXT:    --> (((trunc i64 %iv.2.1 to i32) + {0,+,-1}<%loop_2>) * {-1,+,-1}<%loop_1>) U: full-set S: full-set --> (-2 * ((trunc i64 %iv.2.1 to i32) + {0,+,-1}<%loop_2>)) U: [0,-1) S: [-2147483648,2147483647) Exits: 0 LoopDispositions: { %loop_2: Variant }
+; CHECK-NEXT:    %tmp16 = mul i64 %iv.2.1, %iv.1.1
+; CHECK-NEXT:    --> ({2,+,1}<nuw><nsw><%loop_1> * %iv.2.1) U: full-set S: full-set --> (3 * %iv.2.1) U: full-set S: full-set Exits: 0 LoopDispositions: { %loop_2: Variant }
+; CHECK-NEXT:    %iv.2.3.next = add nuw nsw i64 %iv.2.3, 1
+; CHECK-NEXT:    --> {3,+,1}<nuw><nsw><%loop_2> U: [3,4) S: [3,4) Exits: 3 LoopDispositions: { %loop_2: Computable }
+; CHECK-NEXT:    %iv.2.2.next = add nsw i64 %iv.2.2, -1
+; CHECK-NEXT:    --> {-1,+,-1}<nuw><nsw><%loop_2> U: [-1,0) S: [-1,0) Exits: -1 LoopDispositions: { %loop_2: Computable }
+; CHECK-NEXT:    %tmp10 = add i32 %iv.1.2, 3
+; CHECK-NEXT:    --> {2,+,1}<nuw><nsw><%loop_1> U: [2,4) S: [2,4) --> 3 U: [3,4) S: [3,4)
+; CHECK-NEXT:  Determining loop execution counts for: @test_08
+; CHECK-NEXT:  Loop %loop_2: backedge-taken count is i64 0
+; CHECK-NEXT:  Loop %loop_2: constant max backedge-taken count is i64 0
+; CHECK-NEXT:  Loop %loop_2: symbolic max backedge-taken count is i64 0
+; CHECK-NEXT:  Loop %loop_2: Trip multiple is 1
+; CHECK-NEXT:  Loop %loop_1: backedge-taken count is i64 1
+; CHECK-NEXT:  Loop %loop_1: constant max backedge-taken count is i64 1
+; CHECK-NEXT:  Loop %loop_1: symbolic max backedge-taken count is i64 1
+; CHECK-NEXT:  Loop %loop_1: Trip multiple is 2
+;
 
 entry:
   br label %loop_1
@@ -515,22 +748,36 @@ exit:
 }
 
 define i64 @test_09(i32 %param) {
-
-; CHECK-LABEL: Classifying expressions for: @test_09
-; CHECK:       %iv1 = phi i64 [ %iv1.next, %guarded ], [ 0, %outer.loop ]
-; CHECK-NEXT:    -->  {0,+,1}<nuw><nsw><%loop1>
-; CHECK:       %iv1.trunc = trunc i64 %iv1 to i32
-; CHECK-NEXT:    -->  {0,+,1}<%loop1>
-; CHECK:       %iv1.next = add nuw nsw i64 %iv1, 1
-; CHECK-NEXT:    -->  {1,+,1}<nuw><nsw><%loop1>
-; CHECK:       %iv2 = phi i32 [ %iv2.next, %loop2 ], [ %param, %loop2.preheader ]
-; CHECK-NEXT:    -->  {%param,+,1}<%loop2>
-; CHECK:       %iv2.next = add i32 %iv2, 1
-; CHECK-NEXT:    -->  {(1 + %param),+,1}<%loop2>
-; CHECK:       %iv2.ext = sext i32 %iv2.next to i64
-; CHECK-NEXT:    -->  (sext i32 {(1 + %param),+,1}<%loop2> to i64)
-; CHECK:       %ret = mul i64 %iv1, %iv2.ext
-; CHECK-NEXT:    -->  ((sext i32 {(1 + %param),+,1}<%loop2> to i64) * {0,+,1}<nuw><nsw><%loop1>)
+; CHECK-LABEL: 'test_09'
+; CHECK-NEXT:  Classifying expressions for: @test_09
+; CHECK-NEXT:    %iv1 = phi i64 [ %iv1.next, %guarded ], [ 0, %outer.loop ]
+; CHECK-NEXT:    --> {0,+,1}<nuw><nsw><%loop1> U: [0,3) S: [0,3) Exits: 2 LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %iv1.trunc = trunc i64 %iv1 to i32
+; CHECK-NEXT:    --> {0,+,1}<%loop1> U: [0,3) S: [0,3) Exits: 2 LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %iv1.next = add nuw nsw i64 %iv1, 1
+; CHECK-NEXT:    --> {1,+,1}<nuw><nsw><%loop1> U: [1,4) S: [1,4) Exits: 3 LoopDispositions: { %loop1: Computable }
+; CHECK-NEXT:    %iv2 = phi i32 [ %iv2.next, %loop2 ], [ %param, %loop2.preheader ]
+; CHECK-NEXT:    --> {%param,+,1}<%loop2> U: full-set S: full-set Exits: (2 smax %param) LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %iv2.next = add i32 %iv2, 1
+; CHECK-NEXT:    --> {(1 + %param),+,1}<%loop2> U: full-set S: full-set Exits: (1 + (2 smax %param))<nuw> LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %iv2.ext = sext i32 %iv2.next to i64
+; CHECK-NEXT:    --> (sext i32 {(1 + %param),+,1}<%loop2> to i64) U: [-2147483648,2147483648) S: [-2147483648,2147483648) --> (sext i32 (1 + (2 smax %param))<nuw> to i64) U: [-2147483648,2147483648) S: [-2147483648,2147483648)
+; CHECK-NEXT:    %ret = mul i64 %iv1, %iv2.ext
+; CHECK-NEXT:    --> ((sext i32 {(1 + %param),+,1}<%loop2> to i64) * {0,+,1}<nuw><nsw><%loop1>) U: [-4294967296,4294967295) S: [-4294967296,4294967295) --> (2 * (sext i32 (1 + (2 smax %param))<nuw> to i64))<nsw> U: [0,-1) S: [-4294967296,4294967295)
+; CHECK-NEXT:  Determining loop execution counts for: @test_09
+; CHECK-NEXT:  Loop %loop2: backedge-taken count is ((-1 * %param) + (2 smax %param))
+; CHECK-NEXT:  Loop %loop2: constant max backedge-taken count is i32 -2147483646
+; CHECK-NEXT:  Loop %loop2: symbolic max backedge-taken count is ((-1 * %param) + (2 smax %param))
+; CHECK-NEXT:  Loop %loop2: Trip multiple is 1
+; CHECK-NEXT:  Loop %loop1: <multiple exits> backedge-taken count is i64 2
+; CHECK-NEXT:    exit count for loop1: i64 100
+; CHECK-NEXT:    exit count for guarded: i32 2
+; CHECK-NEXT:  Loop %loop1: constant max backedge-taken count is i64 2
+; CHECK-NEXT:  Loop %loop1: symbolic max backedge-taken count is i64 2
+; CHECK-NEXT:    symbolic max exit count for loop1: i64 100
+; CHECK-NEXT:    symbolic max exit count for guarded: i32 2
+; CHECK-NEXT:  Loop %loop1: Trip multiple is 1
+;
 
 entry:
   br label %outer.loop
@@ -568,26 +815,47 @@ exit:                                          ; preds = %loop2.exit
 }
 
 define i64 @test_10(i32 %param) {
-
-; CHECK-LABEL: Classifying expressions for: @test_10
-; CHECK:       %uncle = phi i64 [ %uncle.outer.next, %uncle.loop.backedge ], [ 0, %outer.loop ]
-; CHECK-NEXT:  -->  {0,+,1}<nuw><nsw><%uncle.loop>
-; CHECK:       %iv1 = phi i64 [ %iv1.next, %guarded ], [ 0, %uncle.loop ]
-; CHECK-NEXT:  -->  {0,+,1}<nuw><nsw><%loop1>
-; CHECK:       %iv1.trunc = trunc i64 %iv1 to i32
-; CHECK-NEXT:  -->  {0,+,1}<%loop1>
-; CHECK:       %iv1.next = add nuw nsw i64 %iv1, 1
-; CHECK-NEXT:  -->  {1,+,1}<nuw><nsw><%loop1>
-; CHECK:       %uncle.outer.next = add i64 %uncle, 1
-; CHECK-NEXT:  -->  {1,+,1}<nuw><nsw><%uncle.loop>
-; CHECK:       %iv2 = phi i32 [ %iv2.next, %loop2 ], [ %param, %loop2.preheader ]
-; CHECK-NEXT:  -->  {%param,+,1}<%loop2>
-; CHECK:       %iv2.next = add i32 %iv2, 1
-; CHECK-NEXT:  -->  {(1 + %param),+,1}<%loop2>
-; CHECK:       %iv2.ext = sext i32 %iv2.next to i64
-; CHECK-NEXT:  -->  (sext i32 {(1 + %param),+,1}<%loop2> to i64)
-; CHECK:       %ret = mul i64 %iv1, %iv2.ext
-; CHECK-NEXT:  -->  ((sext i32 {(1 + %param),+,1}<%loop2> to i64) * {0,+,1}<nuw><nsw><%loop1>)
+; CHECK-LABEL: 'test_10'
+; CHECK-NEXT:  Classifying expressions for: @test_10
+; CHECK-NEXT:    %uncle = phi i64 [ %uncle.outer.next, %uncle.loop.backedge ], [ 0, %outer.loop ]
+; CHECK-NEXT:    --> {0,+,1}<nuw><nsw><%uncle.loop> U: [0,1) S: [0,1) Exits: <<Unknown>> LoopDispositions: { %uncle.loop: Computable, %loop1: Invariant }
+; CHECK-NEXT:    %iv1 = phi i64 [ %iv1.next, %guarded ], [ 0, %uncle.loop ]
+; CHECK-NEXT:    --> {0,+,1}<nuw><nsw><%loop1> U: [0,3) S: [0,3) Exits: 2 LoopDispositions: { %loop1: Computable, %uncle.loop: Variant }
+; CHECK-NEXT:    %iv1.trunc = trunc i64 %iv1 to i32
+; CHECK-NEXT:    --> {0,+,1}<%loop1> U: [0,3) S: [0,3) Exits: 2 LoopDispositions: { %loop1: Computable, %uncle.loop: Variant }
+; CHECK-NEXT:    %iv1.next = add nuw nsw i64 %iv1, 1
+; CHECK-NEXT:    --> {1,+,1}<nuw><nsw><%loop1> U: [1,4) S: [1,4) Exits: 3 LoopDispositions: { %loop1: Computable, %uncle.loop: Variant }
+; CHECK-NEXT:    %uncle.outer.next = add i64 %uncle, 1
+; CHECK-NEXT:    --> {1,+,1}<nuw><nsw><%uncle.loop> U: [1,2) S: [1,2) Exits: <<Unknown>> LoopDispositions: { %uncle.loop: Computable, %loop1: Invariant }
+; CHECK-NEXT:    %iv2 = phi i32 [ %iv2.next, %loop2 ], [ %param, %loop2.preheader ]
+; CHECK-NEXT:    --> {%param,+,1}<%loop2> U: full-set S: full-set Exits: (2 smax %param) LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %iv2.next = add i32 %iv2, 1
+; CHECK-NEXT:    --> {(1 + %param),+,1}<%loop2> U: full-set S: full-set Exits: (1 + (2 smax %param))<nuw> LoopDispositions: { %loop2: Computable }
+; CHECK-NEXT:    %iv2.ext = sext i32 %iv2.next to i64
+; CHECK-NEXT:    --> (sext i32 {(1 + %param),+,1}<%loop2> to i64) U: [-2147483648,2147483648) S: [-2147483648,2147483648) --> (sext i32 (1 + (2 smax %param))<nuw> to i64) U: [-2147483648,2147483648) S: [-2147483648,2147483648)
+; CHECK-NEXT:    %ret = mul i64 %iv1, %iv2.ext
+; CHECK-NEXT:    --> ((sext i32 {(1 + %param),+,1}<%loop2> to i64) * {0,+,1}<nuw><nsw><%loop1>) U: [-4294967296,4294967295) S: [-4294967296,4294967295) --> (2 * (sext i32 (1 + (2 smax %param))<nuw> to i64))<nsw> U: [0,-1) S: [-4294967296,4294967295)
+; CHECK-NEXT:  Determining loop execution counts for: @test_10
+; CHECK-NEXT:  Loop %loop2: backedge-taken count is ((-1 * %param) + (2 smax %param))
+; CHECK-NEXT:  Loop %loop2: constant max backedge-taken count is i32 -2147483646
+; CHECK-NEXT:  Loop %loop2: symbolic max backedge-taken count is ((-1 * %param) + (2 smax %param))
+; CHECK-NEXT:  Loop %loop2: Trip multiple is 1
+; CHECK-NEXT:  Loop %loop1: <multiple exits> backedge-taken count is i64 2
+; CHECK-NEXT:    exit count for loop1: i64 100
+; CHECK-NEXT:    exit count for guarded: i32 2
+; CHECK-NEXT:  Loop %loop1: constant max backedge-taken count is i64 2
+; CHECK-NEXT:  Loop %loop1: symbolic max backedge-taken count is i64 2
+; CHECK-NEXT:    symbolic max exit count for loop1: i64 100
+; CHECK-NEXT:    symbolic max exit count for guarded: i32 2
+; CHECK-NEXT:  Loop %loop1: Trip multiple is 1
+; CHECK-NEXT:  Loop %uncle.loop: <multiple exits> Unpredictable backedge-taken count.
+; CHECK-NEXT:    exit count for loop1: ***COULDNOTCOMPUTE***
+; CHECK-NEXT:    exit count for uncle.loop.backedge: i64 0
+; CHECK-NEXT:  Loop %uncle.loop: constant max backedge-taken count is i64 0
+; CHECK-NEXT:  Loop %uncle.loop: symbolic max backedge-taken count is i64 0
+; CHECK-NEXT:    symbolic max exit count for loop1: ***COULDNOTCOMPUTE***
+; CHECK-NEXT:    symbolic max exit count for uncle.loop.backedge: i64 0
+;
 
 entry:
   br label %outer.loop
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-intrinsic-uaddlv.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-intrinsic-uaddlv.mir
deleted file mode 100644
index 9a81493d973aa7..00000000000000
--- a/llvm/test/CodeGen/AArch64/GlobalISel/select-intrinsic-uaddlv.mir
+++ /dev/null
@@ -1,109 +0,0 @@
-# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=aarch64 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
-
-...
----
-name:            uaddlv_v8s8
-legalized:       true
-regBankSelected: true
-body:             |
-  bb.0:
-    liveins: $d0
-    ; CHECK-LABEL: name: uaddlv_v8s8
-    ; CHECK: %copy:fpr64 = COPY $d0
-    ; CHECK: [[UADDLVv8i8v:%[0-9]+]]:fpr16 = UADDLVv8i8v %copy
-    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[UADDLVv8i8v]], %subreg.hsub
-    ; CHECK: %intrin:fpr32 = COPY [[INSERT_SUBREG]].ssub
-    ; CHECK: $w0 = COPY %intrin
-    ; CHECK: RET_ReallyLR implicit $w0
-    %copy:fpr(<8 x s8>) = COPY $d0
-    %intrin:fpr(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), %copy(<8 x s8>)
-    $w0 = COPY %intrin(s32)
-    RET_ReallyLR implicit $w0
-
-...
----
-name:            uaddlv_v16s8
-legalized:       true
-regBankSelected: true
-body:             |
-  bb.0:
-    liveins: $q0
-
-    ; CHECK-LABEL: name: uaddlv_v16s8
-    ; CHECK: %copy:fpr128 = COPY $q0
-    ; CHECK: [[UADDLVv16i8v:%[0-9]+]]:fpr16 = UADDLVv16i8v %copy
-    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[UADDLVv16i8v]], %subreg.hsub
-    ; CHECK: %intrin:fpr32 = COPY [[INSERT_SUBREG]].ssub
-    ; CHECK: $w0 = COPY %intrin
-    ; CHECK: RET_ReallyLR implicit $w0
-    %copy:fpr(<16 x s8>) = COPY $q0
-    %intrin:fpr(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), %copy(<16 x s8>)
-    $w0 = COPY %intrin(s32)
-    RET_ReallyLR implicit $w0
-...
----
-name:            uaddlv_v4s16
-legalized:       true
-regBankSelected: true
-body:             |
-  bb.0:
-    liveins: $d0
-    ; CHECK-LABEL: name: uaddlv_v4s16
-    ; CHECK: %copy:fpr64 = COPY $d0
-    ; CHECK: [[UADDLVv4i16v:%[0-9]+]]:fpr32 = UADDLVv4i16v %copy
-    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[UADDLVv4i16v]], %subreg.ssub
-    ; CHECK: %intrin:fpr32 = COPY [[INSERT_SUBREG]].ssub
-    ; CHECK: $w0 = COPY %intrin
-    ; CHECK: RET_ReallyLR implicit $w0
-    %copy:fpr(<4 x s16>) = COPY $d0
-    %intrin:fpr(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), %copy(<4 x s16>)
-    $w0 = COPY %intrin(s32)
-    RET_ReallyLR implicit $w0
-
-...
----
-name:            uaddlv_v8s16
-legalized:       true
-regBankSelected: true
-body:             |
-  bb.0:
-    liveins: $q0
-
-    ; CHECK-LABEL: name: uaddlv_v8s16
-    ; CHECK: %copy:fpr128 = COPY $q0
-    ; CHECK: [[UADDLVv8i16v:%[0-9]+]]:fpr32 = UADDLVv8i16v %copy
-    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[UADDLVv8i16v]], %subreg.ssub
-    ; CHECK: %intrin:fpr32 = COPY [[INSERT_SUBREG]].ssub
-    ; CHECK: $w0 = COPY %intrin
-    ; CHECK: RET_ReallyLR implicit $w0
-    %copy:fpr(<8 x s16>) = COPY $q0
-    %intrin:fpr(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), %copy(<8 x s16>)
-    $w0 = COPY %intrin(s32)
-    RET_ReallyLR implicit $w0
-
-...
----
-name:            uaddlv_v4s32
-legalized:       true
-regBankSelected: true
-body:             |
-  bb.0:
-    liveins: $q0
-
-    ; CHECK-LABEL: name: uaddlv_v4s32
-    ; CHECK: %copy:fpr128 = COPY $q0
-    ; CHECK: [[UADDLVv4i32v:%[0-9]+]]:fpr64 = UADDLVv4i32v %copy
-    ; CHECK: [[DEF:%[0-9]+]]:fpr128 = IMPLICIT_DEF
-    ; CHECK: [[INSERT_SUBREG:%[0-9]+]]:fpr128 = INSERT_SUBREG [[DEF]], [[UADDLVv4i32v]], %subreg.dsub
-    ; CHECK: %intrin:fpr64 = COPY [[INSERT_SUBREG]].dsub
-    ; CHECK: $x0 = COPY %intrin
-    ; CHECK: RET_ReallyLR implicit $x0
-    %copy:fpr(<4 x s32>) = COPY $q0
-    %intrin:fpr(s64) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), %copy(<4 x s32>)
-    $x0 = COPY %intrin(s64)
-    RET_ReallyLR implicit $x0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-faminmax.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-faminmax.ll
new file mode 100644
index 00000000000000..dd2deda9839a0d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-faminmax.ll
@@ -0,0 +1,209 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+faminmax -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK-NO-FAMINMAX
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; With +faminmax, minimum(fabs(a), fabs(b)) should select to a single famin(a, b).
+; With +faminmax, maximum(fabs(a), fabs(b)) should select to a single famax(a, b).
+
+define <4 x half> @test_max_v4f16(<4 x half> %a, <4 x half> %b) #0 {
+; CHECK-LABEL: test_max_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    famax v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+;
+; CHECK-NO-FAMINMAX-LABEL: test_max_v4f16:
+; CHECK-NO-FAMINMAX:       // %bb.0:
+; CHECK-NO-FAMINMAX-NEXT:    fabs v0.4h, v0.4h
+; CHECK-NO-FAMINMAX-NEXT:    fabs v1.4h, v1.4h
+; CHECK-NO-FAMINMAX-NEXT:    fmax v0.4h, v0.4h, v1.4h
+; CHECK-NO-FAMINMAX-NEXT:    ret
+  %aa = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a)
+  %ab = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b)
+  %r = call <4 x half> @llvm.maximum.v4f16(<4 x half> %aa, <4 x half> %ab)
+  ret <4 x half> %r
+}
+
+define <4 x half> @test_min_v4f16(<4 x half> %a, <4 x half> %b) #0 {
+; CHECK-LABEL: test_min_v4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    famin v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+;
+; CHECK-NO-FAMINMAX-LABEL: test_min_v4f16:
+; CHECK-NO-FAMINMAX:       // %bb.0:
+; CHECK-NO-FAMINMAX-NEXT:    fabs v0.4h, v0.4h
+; CHECK-NO-FAMINMAX-NEXT:    fabs v1.4h, v1.4h
+; CHECK-NO-FAMINMAX-NEXT:    fmin v0.4h, v0.4h, v1.4h
+; CHECK-NO-FAMINMAX-NEXT:    ret
+  %aa = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a)
+  %ab = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b)
+  %r = call <4 x half> @llvm.minimum.v4f16(<4 x half> %aa, <4 x half> %ab)
+  ret <4 x half> %r
+}
+
+define <8 x half> @test_max_v8f16(<8 x half> %a, <8 x half> %b) #0 {
+; CHECK-LABEL: test_max_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    famax v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+;
+; CHECK-NO-FAMINMAX-LABEL: test_max_v8f16:
+; CHECK-NO-FAMINMAX:       // %bb.0:
+; CHECK-NO-FAMINMAX-NEXT:    fabs v0.8h, v0.8h
+; CHECK-NO-FAMINMAX-NEXT:    fabs v1.8h, v1.8h
+; CHECK-NO-FAMINMAX-NEXT:    fmax v0.8h, v0.8h, v1.8h
+; CHECK-NO-FAMINMAX-NEXT:    ret
+  %aa = call <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
+  %ab = call <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
+  %r = call <8 x half> @llvm.maximum.v8f16(<8 x half> %aa, <8 x half> %ab)
+  ret <8 x half> %r
+}
+
+define <8 x half> @test_min_v8f16(<8 x half> %a, <8 x half> %b) #0 {
+; CHECK-LABEL: test_min_v8f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    famin v0.8h, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+;
+; CHECK-NO-FAMINMAX-LABEL: test_min_v8f16:
+; CHECK-NO-FAMINMAX:       // %bb.0:
+; CHECK-NO-FAMINMAX-NEXT:    fabs v0.8h, v0.8h
+; CHECK-NO-FAMINMAX-NEXT:    fabs v1.8h, v1.8h
+; CHECK-NO-FAMINMAX-NEXT:    fmin v0.8h, v0.8h, v1.8h
+; CHECK-NO-FAMINMAX-NEXT:    ret
+  %aa = call <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
+  %ab = call <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
+  %r = call <8 x half> @llvm.minimum.v8f16(<8 x half> %aa, <8 x half> %ab)
+  ret <8 x half> %r
+}
+
+define <2 x float> @test_max_v2f32(<2 x float> %a, <2 x float> %b) {
+; CHECK-LABEL: test_max_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    famax v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
+;
+; CHECK-NO-FAMINMAX-LABEL: test_max_v2f32:
+; CHECK-NO-FAMINMAX:       // %bb.0:
+; CHECK-NO-FAMINMAX-NEXT:    fabs v0.2s, v0.2s
+; CHECK-NO-FAMINMAX-NEXT:    fabs v1.2s, v1.2s
+; CHECK-NO-FAMINMAX-NEXT:    fmax v0.2s, v0.2s, v1.2s
+; CHECK-NO-FAMINMAX-NEXT:    ret
+  %aa = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+  %ab = call <2 x float> @llvm.fabs.v2f32(<2 x float> %b)
+  %r = call <2 x float> @llvm.maximum.v2f32(<2 x float> %aa, <2 x float> %ab)
+  ret <2 x float> %r
+}
+
+define <2 x float> @test_min_v2f32(<2 x float> %a, <2 x float> %b) {
+; CHECK-LABEL: test_min_v2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    famin v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    ret
+;
+; CHECK-NO-FAMINMAX-LABEL: test_min_v2f32:
+; CHECK-NO-FAMINMAX:       // %bb.0:
+; CHECK-NO-FAMINMAX-NEXT:    fabs v0.2s, v0.2s
+; CHECK-NO-FAMINMAX-NEXT:    fabs v1.2s, v1.2s
+; CHECK-NO-FAMINMAX-NEXT:    fmin v0.2s, v0.2s, v1.2s
+; CHECK-NO-FAMINMAX-NEXT:    ret
+  %aa = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+  %ab = call <2 x float> @llvm.fabs.v2f32(<2 x float> %b)
+  %r = call <2 x float> @llvm.minimum.v2f32(<2 x float> %aa, <2 x float> %ab)
+  ret <2 x float> %r
+}
+
+define <4 x float> @test_max_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_max_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    famax v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+;
+; CHECK-NO-FAMINMAX-LABEL: test_max_v4f32:
+; CHECK-NO-FAMINMAX:       // %bb.0:
+; CHECK-NO-FAMINMAX-NEXT:    fabs v0.4s, v0.4s
+; CHECK-NO-FAMINMAX-NEXT:    fabs v1.4s, v1.4s
+; CHECK-NO-FAMINMAX-NEXT:    fmax v0.4s, v0.4s, v1.4s
+; CHECK-NO-FAMINMAX-NEXT:    ret
+  %aa = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
+  %ab = call <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
+  %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> %aa, <4 x float> %ab)
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_min_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: test_min_v4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    famin v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+;
+; CHECK-NO-FAMINMAX-LABEL: test_min_v4f32:
+; CHECK-NO-FAMINMAX:       // %bb.0:
+; CHECK-NO-FAMINMAX-NEXT:    fabs v0.4s, v0.4s
+; CHECK-NO-FAMINMAX-NEXT:    fabs v1.4s, v1.4s
+; CHECK-NO-FAMINMAX-NEXT:    fmin v0.4s, v0.4s, v1.4s
+; CHECK-NO-FAMINMAX-NEXT:    ret
+  %aa = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
+  %ab = call <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
+  %r = call <4 x float> @llvm.minimum.v4f32(<4 x float> %aa, <4 x float> %ab)
+  ret <4 x float> %r
+}
+
+define <2 x double> @test_max_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test_max_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    famax v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
+;
+; CHECK-NO-FAMINMAX-LABEL: test_max_v2f64:
+; CHECK-NO-FAMINMAX:       // %bb.0:
+; CHECK-NO-FAMINMAX-NEXT:    fabs v0.2d, v0.2d
+; CHECK-NO-FAMINMAX-NEXT:    fabs v1.2d, v1.2d
+; CHECK-NO-FAMINMAX-NEXT:    fmax v0.2d, v0.2d, v1.2d
+; CHECK-NO-FAMINMAX-NEXT:    ret
+  %aa = call <2 x double> @llvm.fabs.v2f64(<2 x double> %a)
+  %ab = call <2 x double> @llvm.fabs.v2f64(<2 x double> %b)
+  %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> %aa, <2 x double> %ab)
+  ret <2 x double> %r
+}
+
+define <2 x double> @test_min_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: test_min_v2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    famin v0.2d, v0.2d, v1.2d
+; CHECK-NEXT:    ret
+;
+; CHECK-NO-FAMINMAX-LABEL: test_min_v2f64:
+; CHECK-NO-FAMINMAX:       // %bb.0:
+; CHECK-NO-FAMINMAX-NEXT:    fabs v0.2d, v0.2d
+; CHECK-NO-FAMINMAX-NEXT:    fabs v1.2d, v1.2d
+; CHECK-NO-FAMINMAX-NEXT:    fmin v0.2d, v0.2d, v1.2d
+; CHECK-NO-FAMINMAX-NEXT:    ret
+  %aa = call <2 x double> @llvm.fabs.v2f64(<2 x double> %a)
+  %ab = call <2 x double> @llvm.fabs.v2f64(<2 x double> %b)
+  %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %aa, <2 x double> %ab)
+  ret <2 x double> %r
+}
+
+
+declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
+declare <4 x half> @llvm.fabs.v4f16(<4 x half>)
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>)
+declare <2 x double> @llvm.fabs.v2f64(<2 x double>)
+
+declare <8 x half> @llvm.minimum.v8f16(<8 x half>, <8 x half>)
+declare <4 x half> @llvm.minimum.v4f16(<4 x half>, <4 x half>)
+declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>)
+declare <2 x float> @llvm.minimum.v2f32(<2 x float>, <2 x float>)
+declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
+
+declare <8 x half> @llvm.maximum.v8f16(<8 x half>, <8 x half>)
+declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>)
+declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
+declare <2 x float> @llvm.maximum.v2f32(<2 x float>, <2 x float>)
+declare <2 x double> @llvm.maximum.v2f64(<2 x double>, <2 x double>)
+
+attributes #0 = { nounwind "target-features"="+fullfp16" }
diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
index 3c8aca5145261d..75a549e348d472 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
@@ -485,3 +485,101 @@ entry:
   store <4 x float> %2, ptr %0, align 8
   ret void
 }
+
+define void @store_saddlv_v8i8(ptr %H, <8 x i8> %sum_h, i32 %idx) {
+; CHECK-LABEL: store_saddlv_v8i8:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    saddlv.8b h0, v0
+; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    sbfiz x9, x1, #3, #32
+; CHECK-NEXT:    smov.h w8, v0[0]
+; CHECK-NEXT:    str w8, [x0, x9]
+; CHECK-NEXT:    ret
+entry:
+  %vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %sum_h)
+  %idxprom = sext i32 %idx to i64
+  %arrayidx = getelementptr inbounds i64, ptr %H, i64 %idxprom
+  store i32 %vaddlvq_s32.i, ptr %arrayidx, align 8
+  ret void
+}
+
+define void @store_saddlv_v16i8(ptr %H, <16 x i8> %sum_h, i32 %idx) {
+; CHECK-LABEL: store_saddlv_v16i8:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    saddlv.16b h0, v0
+; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    sbfiz x9, x1, #3, #32
+; CHECK-NEXT:    smov.h w8, v0[0]
+; CHECK-NEXT:    str w8, [x0, x9]
+; CHECK-NEXT:    ret
+entry:
+  %vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %sum_h)
+  %idxprom = sext i32 %idx to i64
+  %arrayidx = getelementptr inbounds i64, ptr %H, i64 %idxprom
+  store i32 %vaddlvq_s32.i, ptr %arrayidx, align 8
+  ret void
+}
+
+define void @store_saddlv_v4i16(ptr %H, <4 x i16> %sum_h, i32 %idx) {
+; CHECK-LABEL: store_saddlv_v4i16:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    saddlv.4h s0, v0
+; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    sbfiz x8, x1, #3, #32
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    str w9, [x0, x8]
+; CHECK-NEXT:    ret
+entry:
+  %vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> %sum_h)
+  %idxprom = sext i32 %idx to i64
+  %arrayidx = getelementptr inbounds i64, ptr %H, i64 %idxprom
+  store i32 %vaddlvq_s32.i, ptr %arrayidx, align 8
+  ret void
+}
+
+define void @store_saddlv_v8i16(ptr %H, <8 x i16> %sum_h, i32 %idx) {
+; CHECK-LABEL: store_saddlv_v8i16:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    saddlv.8h s0, v0
+; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    sbfiz x8, x1, #3, #32
+; CHECK-NEXT:    fmov w9, s0
+; CHECK-NEXT:    str w9, [x0, x8]
+; CHECK-NEXT:    ret
+entry:
+  %vaddlvq_s32.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> %sum_h)
+  %idxprom = sext i32 %idx to i64
+  %arrayidx = getelementptr inbounds i64, ptr %H, i64 %idxprom
+  store i32 %vaddlvq_s32.i, ptr %arrayidx, align 8
+  ret void
+}
+
+define void @store_saddlv_v2i32(ptr %H, <2 x i32> %sum_h, i32 %idx) {
+; CHECK-LABEL: store_saddlv_v2i32:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    saddlp.1d v0, v0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    str x8, [x0, w1, sxtw #3]
+; CHECK-NEXT:    ret
+entry:
+  %vaddlvq_s32.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> %sum_h)
+  %idxprom = sext i32 %idx to i64
+  %arrayidx = getelementptr inbounds i64, ptr %H, i64 %idxprom
+  store i64 %vaddlvq_s32.i, ptr %arrayidx, align 8
+  ret void
+}
+
+define void @store_saddlv_v4i32(ptr %H, <4 x i32> %sum_h, i32 %idx) {
+; CHECK-LABEL: store_saddlv_v4i32:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    saddlv.4s d0, v0
+; CHECK-NEXT:    fmov x8, d0
+; CHECK-NEXT:    str x8, [x0, w1, sxtw #3]
+; CHECK-NEXT:    ret
+entry:
+  %vaddlvq_s32.i = tail call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> %sum_h)
+  %idxprom = sext i32 %idx to i64
+  %arrayidx = getelementptr inbounds i64, ptr %H, i64 %idxprom
+  store i64 %vaddlvq_s32.i, ptr %arrayidx, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-across.ll b/llvm/test/CodeGen/AArch64/arm64-neon-across.ll
index f7ff64228ecd34..2899197abb2f44 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-across.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-across.ll
@@ -1,83 +1,45 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefixes=CHECK,CHECK-SD
-; RUN: llc < %s -global-isel=1 -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc < %s -mtriple=arm64-none-linux-gnu -mattr=+neon -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=arm64-none-linux-gnu -mattr=+neon -global-isel -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 declare float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float>)
-
 declare float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float>)
-
 declare float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float>)
-
 declare float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float>)
-
 declare i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32>)
-
 declare i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16>)
-
 declare i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8>)
-
 declare i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16>)
-
 declare i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8>)
-
 declare i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32>)
-
 declare i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16>)
-
 declare i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8>)
-
 declare i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32>)
-
 declare i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16>)
-
 declare i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8>)
-
 declare i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16>)
-
 declare i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8>)
-
 declare i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16>)
-
 declare i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8>)
-
 declare i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32>)
-
 declare i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16>)
-
 declare i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8>)
-
 declare i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32>)
-
 declare i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16>)
-
 declare i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8>)
-
 declare i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16>)
-
 declare i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8>)
-
 declare i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16>)
-
 declare i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8>)
-
 declare i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32>)
-
 declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16>)
-
 declare i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8>)
-
 declare i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32>)
-
 declare i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16>)
-
 declare i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8>)
-
 declare i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16>)
-
 declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8>)
-
 declare i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16>)
-
 declare i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8>)
 
 define i16 @test_vaddlv_s8(<8 x i8> %a) {
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll b/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll
index 64fb5b36b2c623..1ed6a273338abb 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll
@@ -240,6 +240,10 @@ define dso_local void @caller() nounwind {
 ; CHECK-NEXT:      .section        .drectve,"yni"
 ; CHECK-NEXT:      .ascii  " /EXPORT:exp"
 
+; CHECK-NEXT:      .def    "EXP+#func";
+; CHECK-NEXT:      .scl    2;
+; CHECK-NEXT:      .type   32;
+; CHECK-NEXT:      .endef
 ; CHECK-NEXT:      .def    func;
 ; CHECK-NEXT:      .scl    2;
 ; CHECK-NEXT:      .type   32;
@@ -252,6 +256,10 @@ define dso_local void @caller() nounwind {
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
 ; CHECK-NEXT:  .set "#func", "#func$hybpatch_thunk"{{$}}
+; CHECK-NEXT:      .def    "EXP+#has_varargs";
+; CHECK-NEXT:      .scl    2;
+; CHECK-NEXT:      .type   32;
+; CHECK-NEXT:      .endef
 ; CHECK-NEXT:      .def    has_varargs;
 ; CHECK-NEXT:      .scl    2;
 ; CHECK-NEXT:      .type   32;
@@ -264,6 +272,10 @@ define dso_local void @caller() nounwind {
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
 ; CHECK-NEXT:  .set "#has_varargs", "#has_varargs$hybpatch_thunk"
+; CHECK-NEXT:      .def    "EXP+#has_sret";
+; CHECK-NEXT:      .scl    2;
+; CHECK-NEXT:      .type   32;
+; CHECK-NEXT:      .endef
 ; CHECK-NEXT:      .def    has_sret;
 ; CHECK-NEXT:      .scl    2;
 ; CHECK-NEXT:      .type   32;
@@ -276,6 +288,10 @@ define dso_local void @caller() nounwind {
 ; CHECK-NEXT:      .type   32;
 ; CHECK-NEXT:      .endef
 ; CHECK-NEXT:  .set "#has_sret", "#has_sret$hybpatch_thunk"
+; CHECK-NEXT:      .def    "EXP+#exp";
+; CHECK-NEXT:      .scl    2;
+; CHECK-NEXT:      .type   32;
+; CHECK-NEXT:      .endef
 ; CHECK-NEXT:      .def    exp;
 ; CHECK-NEXT:      .scl    2;
 ; CHECK-NEXT:      .type   32;
@@ -295,18 +311,18 @@ define dso_local void @caller() nounwind {
 ; SYM:      [78](sec 20)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 #exp$hybpatch_thunk
 ; SYM:      [110](sec  0)(fl 0x00)(ty   0)(scl  69) (nx 1) 0x00000000 func
 ; SYM-NEXT: AUX indx 112 srch 3
-; SYM-NEXT: [112](sec  0)(fl 0x00)(ty   0)(scl   2) (nx 0) 0x00000000 EXP+#func
+; SYM-NEXT: [112](sec  0)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 EXP+#func
 ; SYM:      [116](sec  0)(fl 0x00)(ty   0)(scl  69) (nx 1) 0x00000000 #func
 ; SYM-NEXT: AUX indx 53 srch 3
 ; SYM:      [122](sec  0)(fl 0x00)(ty   0)(scl  69) (nx 1) 0x00000000 has_varargs
 ; SYM-NEXT: AUX indx 124 srch 3
-; SYM-NEXT: [124](sec  0)(fl 0x00)(ty   0)(scl   2) (nx 0) 0x00000000 EXP+#has_varargs
+; SYM-NEXT: [124](sec  0)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 EXP+#has_varargs
 ; SYM-NEXT: [125](sec  0)(fl 0x00)(ty   0)(scl  69) (nx 1) 0x00000000 has_sret
 ; SYM-NEXT: AUX indx 127 srch 3
-; SYM-NEXT: [127](sec  0)(fl 0x00)(ty   0)(scl   2) (nx 0) 0x00000000 EXP+#has_sret
+; SYM-NEXT: [127](sec  0)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 EXP+#has_sret
 ; SYM-NEXT: [128](sec  0)(fl 0x00)(ty   0)(scl  69) (nx 1) 0x00000000 exp
 ; SYM-NEXT: AUX indx 130 srch 3
-; SYM-NEXT: [130](sec  0)(fl 0x00)(ty   0)(scl   2) (nx 0) 0x00000000 EXP+#exp
+; SYM-NEXT: [130](sec  0)(fl 0x00)(ty  20)(scl   2) (nx 0) 0x00000000 EXP+#exp
 ; SYM-NEXT: [131](sec  0)(fl 0x00)(ty   0)(scl  69) (nx 1) 0x00000000 #has_varargs
 ; SYM-NEXT: AUX indx 58 srch 3
 ; SYM-NEXT: [133](sec  0)(fl 0x00)(ty   0)(scl  69) (nx 1) 0x00000000 #has_sret
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
index 1c8a8d635274ec..dcc11609ca2316 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll
@@ -229,8 +229,8 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
 ; CHECK-NEXT:    mov z6.d, z1.d
 ; CHECK-NEXT:    mov z7.d, z0.d
 ; CHECK-NEXT:    add x2, x2, x11
-; CHECK-NEXT:    cmpne p2.d, p0/z, z2.d, #0
-; CHECK-NEXT:    and p1.b, p1/z, p1.b, p2.b
+; CHECK-NEXT:    and z2.d, z2.d, #0xffffffff
+; CHECK-NEXT:    cmpne p1.d, p1/z, z2.d, #0
 ; CHECK-NEXT:    zip2 p2.d, p1.d, p1.d
 ; CHECK-NEXT:    zip1 p1.d, p1.d, p1.d
 ; CHECK-NEXT:    ld1d { z2.d }, p2/z, [x0, #1, mul vl]
diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
index 66d670d0b796bf..cdf2a962f93226 100644
--- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
+++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll
@@ -319,9 +319,8 @@ define i32 @ctz_nxv16i1_poison(<vscale x 16 x i1> %a) {
 define i32 @ctz_and_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
 ; CHECK-LABEL: ctz_and_nxv16i1:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    cmpne p0.b, p0/z, z0.b, z1.b
 ; CHECK-NEXT:    ptrue p1.b
-; CHECK-NEXT:    cmpne p2.b, p1/z, z0.b, z1.b
-; CHECK-NEXT:    and p0.b, p0/z, p0.b, p2.b
 ; CHECK-NEXT:    brkb p0.b, p1/z, p0.b
 ; CHECK-NEXT:    cntp x0, p0, p0.b
 ; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
diff --git a/llvm/test/CodeGen/AArch64/sme-darwin-no-sve-vg.ll b/llvm/test/CodeGen/AArch64/sme-darwin-no-sve-vg.ll
new file mode 100644
index 00000000000000..36a300fea25e5a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-darwin-no-sve-vg.ll
@@ -0,0 +1,161 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -o - %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "arm64-apple-macosx14.0.0"
+
+; Check we don't crash on Darwin and that we don't try to save VG
+; when only SME (and not SVE) is enabled.
+
+; Function Attrs: mustprogress norecurse nounwind ssp uwtable(sync)
+define noundef i32 @main() local_unnamed_addr #0 {
+; CHECK-LABEL: main:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    stp d15, d14, [sp, #-80]! ; 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    stp d13, d12, [sp, #16] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #64] ; 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_offset b8, -24
+; CHECK-NEXT:    .cfi_offset b9, -32
+; CHECK-NEXT:    .cfi_offset b10, -40
+; CHECK-NEXT:    .cfi_offset b11, -48
+; CHECK-NEXT:    .cfi_offset b12, -56
+; CHECK-NEXT:    .cfi_offset b13, -64
+; CHECK-NEXT:    .cfi_offset b14, -72
+; CHECK-NEXT:    .cfi_offset b15, -80
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    bl __ZL9sme_crashv
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    mov w0, #0 ; =0x0
+; CHECK-NEXT:    ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #80 ; 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    .cfi_restore b8
+; CHECK-NEXT:    .cfi_restore b9
+; CHECK-NEXT:    .cfi_restore b10
+; CHECK-NEXT:    .cfi_restore b11
+; CHECK-NEXT:    .cfi_restore b12
+; CHECK-NEXT:    .cfi_restore b13
+; CHECK-NEXT:    .cfi_restore b14
+; CHECK-NEXT:    .cfi_restore b15
+; CHECK-NEXT:    ret
+entry:
+  tail call fastcc void @_ZL9sme_crashv() #4
+  ret i32 0
+}
+
+; Function Attrs: mustprogress norecurse nounwind ssp uwtable(sync)
+define internal fastcc void @_ZL9sme_crashv() unnamed_addr #1 {
+; CHECK-LABEL: _ZL9sme_crashv:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    stp d15, d14, [sp, #-96]! ; 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    stp d13, d12, [sp, #16] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp d11, d10, [sp, #32] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp d9, d8, [sp, #48] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x28, x27, [sp, #64] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #80] ; 16-byte Folded Spill
+; CHECK-NEXT:    add x29, sp, #80
+; CHECK-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_offset w27, -24
+; CHECK-NEXT:    .cfi_offset w28, -32
+; CHECK-NEXT:    .cfi_offset b8, -40
+; CHECK-NEXT:    .cfi_offset b9, -48
+; CHECK-NEXT:    .cfi_offset b10, -56
+; CHECK-NEXT:    .cfi_offset b11, -64
+; CHECK-NEXT:    .cfi_offset b12, -72
+; CHECK-NEXT:    .cfi_offset b13, -80
+; CHECK-NEXT:    .cfi_offset b14, -88
+; CHECK-NEXT:    .cfi_offset b15, -96
+; CHECK-NEXT:    .cfi_remember_state
+; CHECK-NEXT:    sub x9, sp, #160
+; CHECK-NEXT:    and sp, x9, #0xffffffffffffff00
+; CHECK-NEXT:  Lloh0:
+; CHECK-NEXT:    adrp x8, ___stack_chk_guard@GOTPAGE
+; CHECK-NEXT:  Lloh1:
+; CHECK-NEXT:    ldr x8, [x8, ___stack_chk_guard@GOTPAGEOFF]
+; CHECK-NEXT:  Lloh2:
+; CHECK-NEXT:    ldr x8, [x8]
+; CHECK-NEXT:    str x8, [sp, #152]
+; CHECK-NEXT:    mov z0.b, #0 ; =0x0
+; CHECK-NEXT:    stp q0, q0, [sp, #32]
+; CHECK-NEXT:    stp q0, q0, [sp]
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    ; InlineAsm Start
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    st1w { z0.s }, p0, [x8]
+; CHECK-EMPTY:
+; CHECK-NEXT:    ; InlineAsm End
+; CHECK-NEXT:    ldr x8, [sp, #152]
+; CHECK-NEXT:  Lloh3:
+; CHECK-NEXT:    adrp x9, ___stack_chk_guard@GOTPAGE
+; CHECK-NEXT:  Lloh4:
+; CHECK-NEXT:    ldr x9, [x9, ___stack_chk_guard@GOTPAGEOFF]
+; CHECK-NEXT:  Lloh5:
+; CHECK-NEXT:    ldr x9, [x9]
+; CHECK-NEXT:    cmp x9, x8
+; CHECK-NEXT:    b.ne LBB1_2
+; CHECK-NEXT:  ; %bb.1: ; %entry
+; CHECK-NEXT:    sub sp, x29, #80
+; CHECK-NEXT:    .cfi_def_cfa wsp, 96
+; CHECK-NEXT:    ldp x29, x30, [sp, #80] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp x28, x27, [sp, #64] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp d9, d8, [sp, #48] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp d11, d10, [sp, #32] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp d13, d12, [sp, #16] ; 16-byte Folded Reload
+; CHECK-NEXT:    ldp d15, d14, [sp], #96 ; 16-byte Folded Reload
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    .cfi_restore w30
+; CHECK-NEXT:    .cfi_restore w29
+; CHECK-NEXT:    .cfi_restore w27
+; CHECK-NEXT:    .cfi_restore w28
+; CHECK-NEXT:    .cfi_restore b8
+; CHECK-NEXT:    .cfi_restore b9
+; CHECK-NEXT:    .cfi_restore b10
+; CHECK-NEXT:    .cfi_restore b11
+; CHECK-NEXT:    .cfi_restore b12
+; CHECK-NEXT:    .cfi_restore b13
+; CHECK-NEXT:    .cfi_restore b14
+; CHECK-NEXT:    .cfi_restore b15
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  LBB1_2: ; %entry
+; CHECK-NEXT:    .cfi_restore_state
+; CHECK-NEXT:    smstop sm
+; CHECK-NEXT:    bl ___stack_chk_fail
+; CHECK-NEXT:    smstart sm
+; CHECK-NEXT:    .loh AdrpLdrGotLdr Lloh3, Lloh4, Lloh5
+; CHECK-NEXT:    .loh AdrpLdrGotLdr Lloh0, Lloh1, Lloh2
+entry:
+  %uu = alloca [16 x float], align 256
+  call void @llvm.lifetime.start.p0(i64 64, ptr nonnull %uu) #5
+  call void @llvm.memset.p0.i64(ptr noundef nonnull align 256 dereferenceable(64) %uu, i8 0, i64 64, i1 false)
+  call void asm sideeffect "ptrue p0.s\0Ast1w { z0.s }, p0, [$0]\0A", "r"(ptr nonnull %uu) #5
+  call void @llvm.lifetime.end.p0(i64 64, ptr nonnull %uu) #5
+  ret void
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2
+
+; Function Attrs: mustprogress nocallback nofree nounwind willreturn memory(argmem: write)
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2
+
+attributes #0 = { mustprogress norecurse nounwind ssp uwtable(sync) "stack-protector-buffer-size"="8" "target-cpu"="apple-a16" "target-features"="+sme,+sme-f64f64,+sme2" }
+attributes #1 = { mustprogress norecurse nounwind ssp uwtable(sync) "aarch64_pstate_sm_enabled" "stack-protector-buffer-size"="8" "target-cpu"="apple-a16" "target-features"="+sme,+sme-f64f64,+sme2" }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #3 = { mustprogress nocallback nofree nounwind willreturn memory(argmem: write) }
+attributes #4 = { "aarch64_pstate_sm_enabled" "no-builtin-calloc" "no-builtin-stpcpy" }
+attributes #5 = { nounwind }
diff --git a/llvm/test/CodeGen/AArch64/sve-fcmp.ll b/llvm/test/CodeGen/AArch64/sve-fcmp.ll
index 35cbe65c6a8b86..fc5e640aed4aea 100644
--- a/llvm/test/CodeGen/AArch64/sve-fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fcmp.ll
@@ -544,3 +544,119 @@ define %svboolx2 @and_of_multiuse_fcmp_olt_zero(<vscale x 4 x i1> %pg, <vscale x
   %ins.2 = insertvalue %svboolx2 %ins.1, <vscale x 4 x i1> %cmp, 1
   ret %svboolx2 %ins.2
 }
+
+define <vscale x 8 x i1> @logical_and_oeq_zero_pred(<vscale x 8 x i1> %pg, <vscale x 8 x half> %x) {
+; CHECK-LABEL: logical_and_oeq_zero_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    ret
+  %y = fcmp oeq <vscale x 8 x half> %x, zeroinitializer ; ordered-equal compare of every lane against 0.0
+  %z = select <vscale x 8 x i1> %pg, <vscale x 8 x i1> %y, <vscale x 8 x i1> zeroinitializer ; select(%pg, %y, false) is a logical AND; the assertions above expect one predicated fcmeq and no separate and
+  ret <vscale x 8 x i1> %z
+}
+
+define <vscale x 4 x i1> @logical_and_ogt_zero_pred(<vscale x 4 x i1> %pg, <vscale x 4 x half> %x) {
+; CHECK-LABEL: logical_and_ogt_zero_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmgt p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    ret
+  %y = fcmp ogt <vscale x 4 x half> %x, zeroinitializer ; ordered greater-than compare against 0.0 on an nxv4f16 vector
+  %z = select <vscale x 4 x i1> %pg, <vscale x 4 x i1> %y, <vscale x 4 x i1> zeroinitializer ; select(%pg, %y, false) is a logical AND; the assertions above expect one predicated fcmgt and no separate and
+  ret <vscale x 4 x i1> %z
+}
+
+define <vscale x 2 x i1> @logical_and_oge_zero_pred(<vscale x 2 x i1> %pg, <vscale x 2 x half> %x) {
+; CHECK-LABEL: logical_and_oge_zero_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmge p0.h, p0/z, z0.h, #0.0
+; CHECK-NEXT:    ret
+  %y = fcmp oge <vscale x 2 x half> %x, zeroinitializer ; ordered greater-or-equal compare against 0.0 on an nxv2f16 vector
+  %z = select <vscale x 2 x i1> %pg, <vscale x 2 x i1> %y, <vscale x 2 x i1> zeroinitializer ; select(%pg, %y, false) is a logical AND; the assertions above expect one predicated fcmge and no separate and
+  ret <vscale x 2 x i1> %z
+}
+
+define <vscale x 4 x i1> @logical_and_olt_zero_pred(<vscale x 4 x i1> %pg, <vscale x 4 x float> %x) {
+; CHECK-LABEL: logical_and_olt_zero_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmlt p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    ret
+  %y = fcmp olt <vscale x 4 x float> %x, zeroinitializer ; ordered less-than compare against 0.0 on an nxv4f32 vector
+  %z = select <vscale x 4 x i1> %pg, <vscale x 4 x i1> %y, <vscale x 4 x i1> zeroinitializer ; select(%pg, %y, false) is a logical AND; the assertions above expect one predicated fcmlt and no separate and
+  ret <vscale x 4 x i1> %z
+}
+
+define <vscale x 2 x i1> @logical_and_ole_zero_pred(<vscale x 2 x i1> %pg, <vscale x 2 x float> %x) {
+; CHECK-LABEL: logical_and_ole_zero_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmle p0.s, p0/z, z0.s, #0.0
+; CHECK-NEXT:    ret
+  %y = fcmp ole <vscale x 2 x float> %x, zeroinitializer ; ordered less-or-equal compare against 0.0 on an nxv2f32 vector
+  %z = select <vscale x 2 x i1> %pg, <vscale x 2 x i1> %y, <vscale x 2 x i1> zeroinitializer ; select(%pg, %y, false) is a logical AND; the assertions above expect one predicated fcmle and no separate and
+  ret <vscale x 2 x i1> %z
+}
+
+define <vscale x 2 x i1> @logical_and_une_zero_pred(<vscale x 2 x i1> %pg, <vscale x 2 x double> %x) {
+; CHECK-LABEL: logical_and_une_zero_pred:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcmne p0.d, p0/z, z0.d, #0.0
+; CHECK-NEXT:    ret
+  %y = fcmp une <vscale x 2 x double> %x, zeroinitializer ; unordered-or-not-equal compare against 0.0 on an nxv2f64 vector
+  %z = select <vscale x 2 x i1> %pg, <vscale x 2 x i1> %y, <vscale x 2 x i1> zeroinitializer ; select(%pg, %y, false) is a logical AND; the assertions above expect one predicated fcmne and no separate and
+  ret <vscale x 2 x i1> %z
+}
+
+define %svboolx2 @logical_and_of_multiuse_fcmp_ogt(<vscale x 4 x i1> %pg, <vscale x 4 x float> %x, <vscale x 4 x float> %y) {
+; CHECK-LABEL: logical_and_of_multiuse_fcmp_ogt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    fcmgt p1.s, p1/z, z0.s, z1.s
+; CHECK-NEXT:    and p0.b, p0/z, p0.b, p1.b
+; CHECK-NEXT:    ret
+  %cmp = fcmp ogt <vscale x 4 x float> %x, %y
+  %and = select <vscale x 4 x i1> %pg, <vscale x 4 x i1> %cmp, <vscale x 4 x i1> zeroinitializer ; logical AND of %pg and %cmp, written as a select
+  %ins.1 = insertvalue %svboolx2 poison, <vscale x 4 x i1> %and, 0
+  %ins.2 = insertvalue %svboolx2 %ins.1, <vscale x 4 x i1> %cmp, 1 ; second use of %cmp: the assertions above keep the ptrue-governed compare and a separate and
+  ret %svboolx2 %ins.2
+}
+
+define %svboolx2 @logical_and_of_multiuse_fcmp_ogt_zero(<vscale x 4 x i1> %pg, <vscale x 4 x float> %x) {
+; CHECK-LABEL: logical_and_of_multiuse_fcmp_ogt_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    fcmgt p1.s, p1/z, z0.s, #0.0
+; CHECK-NEXT:    and p0.b, p0/z, p0.b, p1.b
+; CHECK-NEXT:    ret
+  %cmp = fcmp ogt <vscale x 4 x float> %x, zeroinitializer ; compare against 0.0; the assertions above use the #0.0 immediate form of fcmgt
+  %and = select <vscale x 4 x i1> %pg, <vscale x 4 x i1> %cmp, <vscale x 4 x i1> zeroinitializer ; logical AND of %pg and %cmp, written as a select
+  %ins.1 = insertvalue %svboolx2 poison, <vscale x 4 x i1> %and, 0
+  %ins.2 = insertvalue %svboolx2 %ins.1, <vscale x 4 x i1> %cmp, 1 ; second use of %cmp: the assertions above keep the ptrue-governed compare and a separate and
+  ret %svboolx2 %ins.2
+}
+
+define %svboolx2 @logical_and_of_multiuse_fcmp_olt(<vscale x 4 x i1> %pg, <vscale x 4 x float> %x, <vscale x 4 x float> %y) {
+; CHECK-LABEL: logical_and_of_multiuse_fcmp_olt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    fcmgt p1.s, p1/z, z1.s, z0.s
+; CHECK-NEXT:    and p0.b, p0/z, p0.b, p1.b
+; CHECK-NEXT:    ret
+  %cmp = fcmp olt <vscale x 4 x float> %x, %y ; the assertions above expect this as fcmgt with the two vector operands swapped
+  %and = select <vscale x 4 x i1> %pg, <vscale x 4 x i1> %cmp, <vscale x 4 x i1> zeroinitializer ; logical AND of %pg and %cmp, written as a select
+  %ins.1 = insertvalue %svboolx2 poison, <vscale x 4 x i1> %and, 0
+  %ins.2 = insertvalue %svboolx2 %ins.1, <vscale x 4 x i1> %cmp, 1 ; second use of %cmp: the assertions above keep the ptrue-governed compare and a separate and
+  ret %svboolx2 %ins.2
+}
+
+define %svboolx2 @logical_and_of_multiuse_fcmp_olt_zero(<vscale x 4 x i1> %pg, <vscale x 4 x float> %x) {
+; CHECK-LABEL: logical_and_of_multiuse_fcmp_olt_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    fcmlt p1.s, p1/z, z0.s, #0.0
+; CHECK-NEXT:    and p0.b, p0/z, p0.b, p1.b
+; CHECK-NEXT:    ret
+  %cmp = fcmp olt <vscale x 4 x float> %x, zeroinitializer ; compare against 0.0; the assertions above use the #0.0 immediate form of fcmlt
+  %and = select <vscale x 4 x i1> %pg, <vscale x 4 x i1> %cmp, <vscale x 4 x i1> zeroinitializer ; logical AND of %pg and %cmp, written as a select
+  %ins.1 = insertvalue %svboolx2 poison, <vscale x 4 x i1> %and, 0
+  %ins.2 = insertvalue %svboolx2 %ins.1, <vscale x 4 x i1> %cmp, 1 ; second use of %cmp: the assertions above keep the ptrue-governed compare and a separate and
+  ret %svboolx2 %ins.2
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
index 0d7f2300626508..afe13851f0b953 100644
--- a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll
@@ -24,8 +24,7 @@ define i64 @scalable_int_min_max(ptr %arg, ptr %arg1, <vscale x 2 x ptr> %i37, <
 ; CHECK-NEXT:    fadd z0.s, p0/m, z0.s, z4.s
 ; CHECK-NEXT:    fcmge p2.s, p0/z, z0.s, z3.s
 ; CHECK-NEXT:    add z0.d, z2.d, z1.d
-; CHECK-NEXT:    not p2.b, p0/z, p2.b
-; CHECK-NEXT:    and p2.b, p1/z, p1.b, p2.b
+; CHECK-NEXT:    bic p2.b, p1/z, p1.b, p2.b
 ; CHECK-NEXT:    mov z0.d, p2/m, z2.d
 ; CHECK-NEXT:    sel z0.d, p1, z0.d, z2.d
 ; CHECK-NEXT:    uaddv d0, p0, z0.d
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
new file mode 100644
index 00000000000000..84c15e4fbc33c7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
@@ -0,0 +1,276 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 2 x i8> @test_compress_nxv2i8(<vscale x 2 x i8> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 2 x i8> @llvm.experimental.vector.compress(<vscale x 2 x i8> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef) ; nxv2i8 with an undef third (passthru) operand; the assertions above expect a single compact on .d lanes
+    ret <vscale x 2 x i8> %out
+}
+
+define <vscale x 2 x i16> @test_compress_nxv2i16(<vscale x 2 x i16> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 2 x i16> @llvm.experimental.vector.compress(<vscale x 2 x i16> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef) ; nxv2i16 with an undef third (passthru) operand; the assertions above expect a single compact on .d lanes
+    ret <vscale x 2 x i16> %out
+}
+
+define <vscale x 2 x i32> @test_compress_nxv2i32(<vscale x 2 x i32> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 2 x i32> @llvm.experimental.vector.compress(<vscale x 2 x i32> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef) ; nxv2i32 with an undef third (passthru) operand; the assertions above expect a single compact on .d lanes
+    ret <vscale x 2 x i32> %out
+}
+
+define <vscale x 2 x i64> @test_compress_nxv2i64(<vscale x 2 x i64> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 2 x i64> @llvm.experimental.vector.compress(<vscale x 2 x i64> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef) ; nxv2i64 with an undef third (passthru) operand; the assertions above expect a single compact on .d lanes
+    ret <vscale x 2 x i64> %out
+}
+
+define <vscale x 2 x float> @test_compress_nxv2f32(<vscale x 2 x float> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 2 x float> @llvm.experimental.vector.compress(<vscale x 2 x float> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef) ; nxv2f32 with an undef third (passthru) operand; the assertions above expect a single compact on .d lanes
+    ret <vscale x 2 x float> %out
+}
+
+define <vscale x 2 x double> @test_compress_nxv2f64(<vscale x 2 x double> %vec, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 2 x double> @llvm.experimental.vector.compress(<vscale x 2 x double> %vec, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef) ; nxv2f64 with an undef third (passthru) operand; the assertions above expect a single compact on .d lanes
+    ret <vscale x 2 x double> %out
+}
+
+define <vscale x 4 x i8> @test_compress_nxv4i8(<vscale x 4 x i8> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i8> @llvm.experimental.vector.compress(<vscale x 4 x i8> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef) ; nxv4i8 with an undef third (passthru) operand; the assertions above expect a single compact on .s lanes
+    ret <vscale x 4 x i8> %out
+}
+
+define <vscale x 4 x i16> @test_compress_nxv4i16(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i16> @llvm.experimental.vector.compress(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef) ; nxv4i16 with an undef third (passthru) operand; the assertions above expect a single compact on .s lanes
+    ret <vscale x 4 x i16> %out
+}
+
+define <vscale x 4 x i32> @test_compress_nxv4i32(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef) ; nxv4i32 with an undef third (passthru) operand; the assertions above expect a single compact on .s lanes
+    ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x float> @test_compress_nxv4f32(<vscale x 4 x float> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x float> @llvm.experimental.vector.compress(<vscale x 4 x float> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef) ; nxv4f32 with an undef third (passthru) operand; the assertions above expect a single compact on .s lanes
+    ret <vscale x 4 x float> %out
+}
+
+define <vscale x 4 x i4> @test_compress_illegal_element_type(<vscale x 4 x i4> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_illegal_element_type:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i4> @llvm.experimental.vector.compress(<vscale x 4 x i4> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i4> undef) ; i4 elements (the "illegal" type in the test name); the assertions above still expect a single compact on .s lanes
+    ret <vscale x 4 x i4> %out
+}
+
+define <vscale x 8 x i32> @test_compress_large(<vscale x 8 x i32> %vec, <vscale x 8 x i1> %mask) {
+; CHECK-LABEL: test_compress_large:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-2
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    punpklo p2.h, p0.b
+; CHECK-NEXT:    cnth x9
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    sub x9, x9, #1
+; CHECK-NEXT:    punpkhi p0.h, p0.b
+; CHECK-NEXT:    compact z0.s, p2, z0.s
+; CHECK-NEXT:    cntp x8, p1, p2.s
+; CHECK-NEXT:    compact z1.s, p0, z1.s
+; CHECK-NEXT:    st1w { z0.s }, p1, [sp]
+; CHECK-NEXT:    mov w8, w8
+; CHECK-NEXT:    cmp x8, x9
+; CHECK-NEXT:    csel x8, x8, x9, lo
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    st1w { z1.s }, p1, [x9, x8, lsl #2]
+; CHECK-NEXT:    ld1w { z0.s }, p1/z, [sp]
+; CHECK-NEXT:    ld1w { z1.s }, p1/z, [sp, #1, mul vl]
+; CHECK-NEXT:    addvl sp, sp, #2
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+    %out = call <vscale x 8 x i32> @llvm.experimental.vector.compress(<vscale x 8 x i32> %vec, <vscale x 8 x i1> %mask, <vscale x 8 x i32> undef) ; nxv8i32 input; the assertions above split the mask (punpklo/punpkhi), compact each half, and recombine the halves through a stack slot
+    ret <vscale x 8 x i32> %out
+}
+
+; The constant- and undef-mask tests below take a placeholder first argument (%ignore) so that the expected no-op is
+; visible in the output: either a plain copy of the second vector argument's register into the return register, or nothing.
+define <vscale x 4 x i32> @test_compress_const_splat1_mask(<vscale x 4 x i32> %ignore, <vscale x 4 x i32> %vec) {
+; CHECK-LABEL: test_compress_const_splat1_mask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, z1.d
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> splat (i1 -1), <vscale x 4 x i32> undef) ; all-true constant mask; the assertions above expect only a register move (mov z0.d, z1.d) and no compact
+    ret <vscale x 4 x i32> %out
+}
+define <vscale x 4 x i32> @test_compress_const_splat0_mask(<vscale x 4 x i32> %ignore, <vscale x 4 x i32> %vec) {
+; CHECK-LABEL: test_compress_const_splat0_mask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> splat (i1 0), <vscale x 4 x i32> undef) ; all-false constant mask with an undef third operand; the assertions above expect no instructions before ret
+    ret <vscale x 4 x i32> %out
+}
+define <vscale x 4 x i32> @test_compress_undef_mask(<vscale x 4 x i32> %ignore, <vscale x 4 x i32> %vec) {
+; CHECK-LABEL: test_compress_undef_mask:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> undef, <vscale x 4 x i32> undef) ; undef mask and undef third operand; the assertions above expect no instructions before ret
+    ret <vscale x 4 x i32> %out
+}
+
+define <4 x i32> @test_compress_v4i32_with_sve(<4 x i32> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compress_v4i32_with_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    shl v1.4s, v1.4s, #31
+; CHECK-NEXT:    cmlt v1.4s, v1.4s, #0
+; CHECK-NEXT:    and z1.s, z1.s, #0x1
+; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT:    ret
+    %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> undef) ; fixed-length v4i32; the assertions above widen the <4 x i1> mask, turn it into a predicate with cmpne, then use one compact on .s lanes
+    ret <4 x i32> %out
+}
+
+define <1 x i32> @test_compress_v1i32_with_sve(<1 x i32> %vec, <1 x i1> %mask) {
+; CHECK-LABEL: test_compress_v1i32_with_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    sbfx w8, w0, #0, #1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    mov v1.s[0], w8
+; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-NEXT:    and z1.d, z1.d, #0x1
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+    %out = call <1 x i32> @llvm.experimental.vector.compress(<1 x i32> %vec, <1 x i1> %mask, <1 x i32> undef) ; single-element vector; the assertions above widen to .d lanes (ushll), compact once, then narrow the result with xtn
+    ret <1 x i32> %out
+}
+
+define <4 x double> @test_compress_v4f64_with_sve(<4 x double> %vec, <4 x i1> %mask) {
+; CHECK-LABEL: test_compress_v4f64_with_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #32
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
+; CHECK-NEXT:    ushll v3.2d, v2.2s, #0
+; CHECK-NEXT:    ushll2 v4.2d, v2.4s, #0
+; CHECK-NEXT:    fmov x8, d2
+; CHECK-NEXT:    shl v3.2d, v3.2d, #63
+; CHECK-NEXT:    shl v4.2d, v4.2d, #63
+; CHECK-NEXT:    lsr x9, x8, #32
+; CHECK-NEXT:    eor w8, w8, w9
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    cmlt v3.2d, v3.2d, #0
+; CHECK-NEXT:    cmlt v4.2d, v4.2d, #0
+; CHECK-NEXT:    and x8, x8, #0x3
+; CHECK-NEXT:    lsl x8, x8, #3
+; CHECK-NEXT:    and z3.d, z3.d, #0x1
+; CHECK-NEXT:    and z4.d, z4.d, #0x1
+; CHECK-NEXT:    cmpne p1.d, p0/z, z3.d, #0
+; CHECK-NEXT:    cmpne p0.d, p0/z, z4.d, #0
+; CHECK-NEXT:    compact z0.d, p1, z0.d
+; CHECK-NEXT:    compact z1.d, p0, z1.d
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    str q1, [x9, x8]
+; CHECK-NEXT:    ldp q0, q1, [sp], #32
+; CHECK-NEXT:    ret
+    %out = call <4 x double> @llvm.experimental.vector.compress(<4 x double> %vec, <4 x i1> %mask, <4 x double> undef) ; v4f64 arrives in two q registers; the assertions above compact each on .d lanes and recombine them via stores to the stack and an ldp reload
+    ret <4 x double> %out
+}
+
+define <2 x i16> @test_compress_v2i16_with_sve(<2 x i16> %vec, <2 x i1> %mask) {
+; CHECK-LABEL: test_compress_v2i16_with_sve:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-NEXT:    and z1.d, z1.d, #0x1
+; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
+; CHECK-NEXT:    compact z0.d, p0, z0.d
+; CHECK-NEXT:    xtn v0.2s, v0.2d
+; CHECK-NEXT:    ret
+    %out = call <2 x i16> @llvm.experimental.vector.compress(<2 x i16> %vec, <2 x i1> %mask, <2 x i16> undef) ; fixed-length v2i16; the assertions above widen data and mask to .d lanes (ushll), compact once, then narrow with xtn
+    ret <2 x i16> %out
+}
+
+
+define <vscale x 4 x i32> @test_compress_nxv4i32_with_passthru(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) {
+; CHECK-LABEL: test_compress_nxv4i32_with_passthru:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntp x8, p0, p0.s
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) ; non-undef passthru operand; the assertions above add cntp + whilelo to build a predicate and a sel after the compact
+    ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x i32> @test_compress_nxv4i32_with_zero_passthru(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i32_with_zero_passthru:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> splat(i32 0)) ; all-zero passthru operand; the assertions above expect just the compact, with no cntp/whilelo/sel
+    ret <vscale x 4 x i32> %out
+}
+
+define <vscale x 4 x i32> @test_compress_nxv4i32_with_const_passthru(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: test_compress_nxv4i32_with_const_passthru:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntp x8, p0, p0.s
+; CHECK-NEXT:    compact z0.s, p0, z0.s
+; CHECK-NEXT:    mov z1.s, #5 // =0x5
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT:    ret
+    %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> splat(i32 5)) ; non-zero constant passthru; the assertions above materialise it (mov z1.s, #5) and apply a sel after the compact
+    ret <vscale x 4 x i32> %out
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
index 014534ab79fe64..0ff633fb4d8bec 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
@@ -49,10 +49,11 @@ body: |
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
     ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
-    ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
+    ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+    ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
     ;
     ; GFX11-FAKE16-LABEL: name: fceil_s16_vv
     ; GFX11-FAKE16: liveins: $vgpr0
@@ -89,8 +90,9 @@ body: |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
-    ; GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
+    ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+    ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
     ;
     ; GFX11-FAKE16-LABEL: name: fceil_s16_vs
     ; GFX11-FAKE16: liveins: $sgpr0
@@ -126,10 +128,11 @@ body: |
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
     ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
-    ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
+    ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+    ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
     ;
     ; GFX11-FAKE16-LABEL: name: fceil_fneg_s16_vv
     ; GFX11-FAKE16: liveins: $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
index dcf9e169f586be..fc8a6aaa17512b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
@@ -58,10 +58,11 @@ body: |
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
     ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]]
-    ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
+    ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+    ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
     ;
     ; GFX11-FAKE16-LABEL: name: ffloor_s16_vv
     ; GFX11-FAKE16: liveins: $vgpr0
@@ -98,8 +99,9 @@ body: |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
     ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]]
-    ; GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
+    ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+    ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
     ;
     ; GFX11-FAKE16-LABEL: name: ffloor_s16_vs
     ; GFX11-FAKE16: liveins: $sgpr0
@@ -135,10 +137,11 @@ body: |
     ; GFX11: liveins: $vgpr0
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
+    ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16
     ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
-    ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]]
-    ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
+    ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+    ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16
+    ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]]
     ;
     ; GFX11-FAKE16-LABEL: name: ffloor_fneg_s16_vv
     ; GFX11-FAKE16: liveins: $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 970bb08e1838b2..d9ce1e4efe0e50 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -17180,11 +17180,17 @@ define bfloat @v_fabs_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fabs_bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fabs_bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_and_b16 v0.l, 0x7fff, v0.l
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fabs_bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.fabs.bf16(bfloat %a)
   ret bfloat %op
 }
@@ -17266,11 +17272,17 @@ define bfloat @v_fneg_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fneg_bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fneg_bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fneg_bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = fneg bfloat %a
   ret bfloat %op
 }
@@ -17365,11 +17377,17 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
 ; GFX10-NEXT:    v_or_b32_e32 v0, 0x8000, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fneg_fabs_bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_or_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fneg_fabs_bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_or_b16 v0.l, 0x8000, v0.l
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fneg_fabs_bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_or_b32_e32 v0, 0x8000, v0
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
   %op = fneg bfloat %fabs
   ret bfloat %op
@@ -25005,40 +25023,22 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11TRUE16-LABEL: v_frexp_bf16_i16:
-; GFX11TRUE16:       ; %bb.0:
-; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_frexp_mant_f32_e32 v1, v0
-; GFX11TRUE16-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX11TRUE16-NEXT:    v_or_b32_e32 v3, 0x400000, v1
-; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
-; GFX11TRUE16-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT:    v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, v2
-; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11FAKE16-LABEL: v_frexp_bf16_i16:
-; GFX11FAKE16:       ; %bb.0:
-; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_frexp_mant_f32_e32 v0, v1
-; GFX11FAKE16-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX11FAKE16-NEXT:    v_or_b32_e32 v3, 0x400000, v0
-; GFX11FAKE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11FAKE16-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-LABEL: v_frexp_bf16_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_frexp_mant_f32_e32 v0, v1
+; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call { bfloat, i16 } @llvm.frexp.bf16.i16(bfloat %a)
   ret { bfloat, i16 } %op
 }
@@ -28662,17 +28662,29 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fptosi_v2bf16_to_v2i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fptosi_v2bf16_to_v2i16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fptosi_v2bf16_to_v2i16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX11FAKE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = fptosi <2 x bfloat> %x to <2 x i16>
   ret <2 x i16> %op
 }
@@ -28753,19 +28765,33 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fptosi_v3bf16_to_v3i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX11-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fptosi_v3bf16_to_v3i16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fptosi_v3bf16_to_v3i16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX11FAKE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v2, 0x5040100
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = fptosi <3 x bfloat> %x to <3 x i16>
   ret <3 x i16> %op
 }
@@ -28867,23 +28893,41 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
 ; GFX10-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_fptosi_v4bf16_to_v4i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_fptosi_v4bf16_to_v4i16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_fptosi_v4bf16_to_v4i16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX11FAKE16-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX11FAKE16-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11FAKE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v3, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v1, v1, v2, 0x5040100
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = fptosi <4 x bfloat> %x to <4 x i16>
   ret <4 x i16> %op
 }
@@ -34518,15 +34562,25 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_select_fneg_lhs_bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT:    v_xor_b16 v1.l, 0x8000, v1.l
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11FAKE16-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %neg.a = fneg bfloat %a
   %op = select i1 %cond, bfloat %neg.a, bfloat %b
   ret bfloat %op
@@ -34582,15 +34636,25 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_select_fneg_rhs_bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT:    v_xor_b16 v2.l, 0x8000, v2.l
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11FAKE16-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc_lo
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %neg.b = fneg bfloat %b
   %op = select i1 %cond, bfloat %a, bfloat %neg.b
   ret bfloat %op
@@ -34673,18 +34737,31 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_select_v2bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_select_v2bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT:    v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v3, v4
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_select_v2bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v4, v3
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
   ret <2 x bfloat> %op
 }
@@ -34768,20 +34845,35 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_vselect_v2bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_vselect_v2bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT:    v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_vselect_v2bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc_lo
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
   ret <2 x bfloat> %op
 }
@@ -34936,20 +35028,36 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: s_select_v2bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s0
-; GFX11-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, s3, v1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, s1, v2, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11TRUE16-LABEL: s_select_v2bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, s2
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, s3
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, s1
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, s0
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_dual_cndmask_b32 v0, v1, v2 :: v_dual_cndmask_b32 v1, v3, v4
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11FAKE16-LABEL: s_select_v2bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11FAKE16-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s0
+; GFX11FAKE16-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, s3, v1, vcc_lo
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, s1, v2, vcc_lo
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11FAKE16-NEXT:    ; return to shader part epilog
   %cond = icmp eq i32 %c, 0
   %op = select i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b
   %cast = bitcast <2 x bfloat> %op to i32
@@ -35038,21 +35146,39 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: s_vselect_v2bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s0
-; GFX11-NEXT:    s_lshr_b32 s0, s1, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, s0, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, s1, v3, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11TRUE16-LABEL: s_vselect_v2bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, s2
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, s3
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, s1
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, s0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11FAKE16-LABEL: s_vselect_v2bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11FAKE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s0
+; GFX11FAKE16-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, s0, v2, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, s1, v3, vcc_lo
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11FAKE16-NEXT:    ; return to shader part epilog
   %cond = icmp eq <2 x i32> %c, zeroinitializer
   %op = select <2 x i1> %cond, <2 x bfloat> %a, <2 x bfloat> %b
   %cast = bitcast <2 x bfloat> %op to i32
@@ -36740,33 +36866,64 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
-; GFX11-LABEL: s_vselect_v4bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_lshr_b32 s4, s1, 16
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s1
-; GFX11-NEXT:    s_lshr_b32 s4, s3, 16
-; GFX11-NEXT:    s_lshr_b32 s5, s0, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, s4, v4, vcc_lo
-; GFX11-NEXT:    v_mov_b32_e32 v4, s5
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_mov_b32_e32 v6, s0
-; GFX11-NEXT:    s_lshr_b32 s0, s2, 16
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, s0, v4, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, s2, v6, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, s3, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX11TRUE16-LABEL: s_vselect_v4bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX11TRUE16-NEXT:    s_lshr_b32 s5, s1, 16
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, s4
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, s5
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX11TRUE16-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v7.l, s2
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v8.l, s0
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v6.l, s4
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, s3
+; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, s3
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v9.l, s1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v9, vcc_lo
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX11TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX11FAKE16-LABEL: s_vselect_v4bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX11FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s1
+; GFX11FAKE16-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX11FAKE16-NEXT:    s_lshr_b32 s5, s0, 16
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v3, s4, v4, vcc_lo
+; GFX11FAKE16-NEXT:    v_mov_b32_e32 v4, s5
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11FAKE16-NEXT:    v_mov_b32_e32 v6, s0
+; GFX11FAKE16-NEXT:    s_lshr_b32 s0, s2, 16
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instid1(SALU_CYCLE_1)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, s0, v4, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, s2, v6, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, s3, v5, vcc_lo
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX11FAKE16-NEXT:    ; return to shader part epilog
   %cond = icmp eq <4 x i32> %c, zeroinitializer
   %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
   %cast = bitcast <4 x bfloat> %op to <2 x i32>
@@ -36910,29 +37067,57 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
 ; GFX10-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_vselect_v4bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
-; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v7, v5 :: v_dual_and_b32 v3, 1, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v6, v4 :: v_dual_and_b32 v1, 1, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_vselect_v4bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v8.l, v7.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v8, v9, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v6, v4, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_vselect_v4bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v6
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v2, v7, v5 :: v_dual_and_b32 v3, 1, v3
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v0, v6, v4 :: v_dual_and_b32 v1, 1, v1
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v9, v8, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc_lo
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = select <4 x i1> %cond, <4 x bfloat> %a, <4 x bfloat> %b
   ret <4 x bfloat> %op
 }
@@ -37172,47 +37357,97 @@ define <8 x bfloat> @v_vselect_v8bf16(<8 x i1> %cond, <8 x bfloat> %a, <8 x bflo
 ; GFX10-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_vselect_v8bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 16, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
-; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX11-NEXT:    v_dual_cndmask_b32 v6, v15, v11 :: v_dual_and_b32 v1, 1, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
-; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v14, v10 :: v_dual_and_b32 v3, 1, v3
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v17, v16, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT:    v_dual_cndmask_b32 v2, v13, v9 :: v_dual_and_b32 v7, 1, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v14, v10, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v13, v9, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v15, v11, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_vselect_v8bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v16.l, v15.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v17.l, v11.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v6, v16, v17, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v14
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v10
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v4, v14, v10, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v10.l, v13.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v14.l, v9.l
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v5, v16, v17, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v10, v14, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v12
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v8
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v10, v14, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v13, v9, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v7, v15, v11, vcc_lo
+; GFX11TRUE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_vselect_v8bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v10
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v14
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v6, v15, v11 :: v_dual_and_b32 v1, 1, v1
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v11, 16, v11
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v15, 16, v15
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v4, v14, v10 :: v_dual_and_b32 v3, 1, v3
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v14, 16, v12
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v5, v17, v16, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v2, v13, v9 :: v_dual_and_b32 v7, 1, v7
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v9
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v13, 16, v13
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v12, v8, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v14, v10, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v3, v13, v9, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX11FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v7, v15, v11, vcc_lo
+; GFX11FAKE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = select <8 x i1> %cond, <8 x bfloat> %a, <8 x bfloat> %b
   ret <8 x bfloat> %op
 }
@@ -37777,85 +38012,186 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x
 ; GFX10-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_vselect_v16bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    scratch_load_b32 v31, off, s32
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v19
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v27
-; GFX11-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX11-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 16, v18
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v26
-; GFX11-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX11-NEXT:    v_dual_cndmask_b32 v12, v30, v22 :: v_dual_and_b32 v11, 1, v11
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 16, v16
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v34, v33, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
-; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 16, v24
-; GFX11-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX11-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v29, v21, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
-; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v20
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v28
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 16, v17
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v36, v35, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
-; GFX11-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v25
-; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v28, v20, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
-; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v38, v37, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v27, v19, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11-NEXT:    v_dual_cndmask_b32 v4, v26, v18 :: v_dual_and_b32 v7, 1, v7
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v25, v17, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v52, v51, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v54, v53, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v50, v49, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v48, v39, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v31
-; GFX11-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
-; GFX11-NEXT:    v_cndmask_b32_e32 v14, v31, v23, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, v3, v32, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_vselect_v16bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v49.l, v26.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v50.l, v18.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v18, 16, v18
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v26, 16, v26
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v33.l, v30.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v34.l, v22.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v53.l, v24.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v54.l, v16.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v24, 16, v24
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v35.l, v29.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v36.l, v21.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v51.l, v25.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v12, v33, v34, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v52.l, v17.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v17, 16, v17
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v25, 16, v25
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v37.l, v28.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v38.l, v20.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v20, 16, v20
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v10, v35, v36, vcc_lo
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v28, 16, v28
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v39.l, v27.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v48.l, v19.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v19, 16, v19
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v8, v37, v38, vcc_lo
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v27, 16, v27
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v21, 16, v21
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v29, 16, v29
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v6, v39, v48, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v22, 16, v22
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v30, 16, v30
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v4, v49, v50, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v11, 1, v11
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v32.l, v23.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v23, 16, v23
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v51, v52, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v53, v54, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v13, v30, v22, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v11, v29, v21, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v9, v28, v20, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v7, v27, v19, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v24, v16, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v25, v17, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v5, v26, v18, vcc_lo
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v16, 16, v31
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v17.l, v31.l
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, v16.l
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_dual_cndmask_b32 v14, v17, v32 :: v_dual_and_b32 v15, 1, v15
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v15, v3, v23, vcc_lo
+; GFX11TRUE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11TRUE16-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_vselect_v16bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v19
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v27
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v22
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v30
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v18
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v26
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v12, v30, v22 :: v_dual_and_b32 v11, 1, v11
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v21
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v29
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v16
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v13, v34, v33, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v24
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v10, v29, v21, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v20
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v28
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v17
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v11, v36, v35, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v25
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v23
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v8, v28, v20, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v15, 1, v15
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v9, v38, v37, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v6, v27, v19, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v4, v26, v18 :: v_dual_and_b32 v7, 1, v7
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v25, v17, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v3, v52, v51, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v24, v16, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v54, v53, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v5, v50, v49, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX11FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v7, v48, v39, vcc_lo
+; GFX11FAKE16-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v31
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v14, v31, v23, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v15, v3, v32, vcc_lo
+; GFX11FAKE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11FAKE16-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = select <16 x i1> %cond, <16 x bfloat> %a, <16 x bfloat> %b
   ret <16 x bfloat> %op
 }
@@ -39279,205 +39615,455 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: v_vselect_v32bf16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_clause 0x1f
-; GFX11-NEXT:    scratch_load_u16 v31, off, s32
-; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:128
-; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:64
-; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:124
-; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:60
-; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:120
-; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:56
-; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:116
-; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:52
-; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:112
-; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:48
-; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:108
-; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:44
-; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:104
-; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:40
-; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:100
-; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:36
-; GFX11-NEXT:    scratch_load_b32 v64, off, s32 offset:96
-; GFX11-NEXT:    scratch_load_b32 v65, off, s32 offset:32
-; GFX11-NEXT:    scratch_load_b32 v66, off, s32 offset:92
-; GFX11-NEXT:    scratch_load_b32 v67, off, s32 offset:28
-; GFX11-NEXT:    scratch_load_b32 v68, off, s32 offset:88
-; GFX11-NEXT:    scratch_load_b32 v69, off, s32 offset:24
-; GFX11-NEXT:    scratch_load_b32 v70, off, s32 offset:84
-; GFX11-NEXT:    scratch_load_b32 v71, off, s32 offset:20
-; GFX11-NEXT:    scratch_load_b32 v80, off, s32 offset:80
-; GFX11-NEXT:    scratch_load_b32 v81, off, s32 offset:16
-; GFX11-NEXT:    scratch_load_b32 v82, off, s32 offset:76
-; GFX11-NEXT:    scratch_load_b32 v83, off, s32 offset:12
-; GFX11-NEXT:    scratch_load_b32 v84, off, s32 offset:72
-; GFX11-NEXT:    scratch_load_b32 v85, off, s32 offset:8
-; GFX11-NEXT:    scratch_load_b32 v86, off, s32 offset:68
-; GFX11-NEXT:    scratch_load_b32 v87, off, s32 offset:4
-; GFX11-NEXT:    v_and_b32_e32 v30, 1, v30
-; GFX11-NEXT:    v_and_b32_e32 v28, 1, v28
-; GFX11-NEXT:    v_and_b32_e32 v26, 1, v26
-; GFX11-NEXT:    v_and_b32_e32 v24, 1, v24
-; GFX11-NEXT:    v_and_b32_e32 v22, 1, v22
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v30
-; GFX11-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX11-NEXT:    v_and_b32_e32 v20, 1, v20
-; GFX11-NEXT:    v_and_b32_e32 v18, 1, v18
-; GFX11-NEXT:    v_and_b32_e32 v16, 1, v16
-; GFX11-NEXT:    s_waitcnt vmcnt(30)
-; GFX11-NEXT:    v_cndmask_b32_e32 v30, v32, v33, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v28
-; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
-; GFX11-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
-; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_waitcnt vmcnt(28)
-; GFX11-NEXT:    v_cndmask_b32_e32 v28, v34, v35, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v26
-; GFX11-NEXT:    v_and_b32_e32 v7, 1, v7
-; GFX11-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
-; GFX11-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
-; GFX11-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11-NEXT:    s_waitcnt vmcnt(26)
-; GFX11-NEXT:    v_cndmask_b32_e32 v26, v36, v37, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v24
-; GFX11-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX11-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
-; GFX11-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
-; GFX11-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX11-NEXT:    s_waitcnt vmcnt(24)
-; GFX11-NEXT:    v_cndmask_b32_e32 v24, v38, v39, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v22
-; GFX11-NEXT:    v_and_b32_e32 v11, 1, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v39, 16, v39
-; GFX11-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
-; GFX11-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX11-NEXT:    s_waitcnt vmcnt(22)
-; GFX11-NEXT:    v_cndmask_b32_e32 v22, v48, v49, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v20
-; GFX11-NEXT:    v_and_b32_e32 v9, 1, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v49, 16, v49
-; GFX11-NEXT:    v_lshrrev_b32_e32 v48, 16, v48
-; GFX11-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX11-NEXT:    s_waitcnt vmcnt(20)
-; GFX11-NEXT:    v_cndmask_b32_e32 v20, v50, v51, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v18
-; GFX11-NEXT:    v_and_b32_e32 v15, 1, v15
-; GFX11-NEXT:    v_lshrrev_b32_e32 v51, 16, v51
-; GFX11-NEXT:    v_lshrrev_b32_e32 v50, 16, v50
-; GFX11-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX11-NEXT:    s_waitcnt vmcnt(18)
-; GFX11-NEXT:    v_cndmask_b32_e32 v18, v52, v53, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
-; GFX11-NEXT:    v_and_b32_e32 v13, 1, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v53, 16, v53
-; GFX11-NEXT:    v_lshrrev_b32_e32 v52, 16, v52
-; GFX11-NEXT:    v_and_b32_e32 v12, 1, v12
-; GFX11-NEXT:    s_waitcnt vmcnt(16)
-; GFX11-NEXT:    v_cndmask_b32_e32 v16, v54, v55, vcc_lo
-; GFX11-NEXT:    v_lshrrev_b32_e32 v55, 16, v55
-; GFX11-NEXT:    v_lshrrev_b32_e32 v54, 16, v54
-; GFX11-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
-; GFX11-NEXT:    s_waitcnt vmcnt(14)
-; GFX11-NEXT:    v_dual_cndmask_b32 v14, v64, v65 :: v_dual_and_b32 v19, 1, v19
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
-; GFX11-NEXT:    v_and_b32_e32 v17, 1, v17
-; GFX11-NEXT:    v_lshrrev_b32_e32 v65, 16, v65
-; GFX11-NEXT:    v_lshrrev_b32_e32 v64, 16, v64
-; GFX11-NEXT:    s_waitcnt vmcnt(12)
-; GFX11-NEXT:    v_cndmask_b32_e32 v12, v66, v67, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
-; GFX11-NEXT:    v_and_b32_e32 v23, 1, v23
-; GFX11-NEXT:    v_lshrrev_b32_e32 v67, 16, v67
-; GFX11-NEXT:    v_lshrrev_b32_e32 v66, 16, v66
-; GFX11-NEXT:    s_waitcnt vmcnt(10)
-; GFX11-NEXT:    v_cndmask_b32_e32 v10, v68, v69, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
-; GFX11-NEXT:    v_and_b32_e32 v21, 1, v21
-; GFX11-NEXT:    v_lshrrev_b32_e32 v69, 16, v69
-; GFX11-NEXT:    v_lshrrev_b32_e32 v68, 16, v68
-; GFX11-NEXT:    s_waitcnt vmcnt(8)
-; GFX11-NEXT:    v_cndmask_b32_e32 v8, v70, v71, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX11-NEXT:    v_and_b32_e32 v27, 1, v27
-; GFX11-NEXT:    v_lshrrev_b32_e32 v71, 16, v71
-; GFX11-NEXT:    v_lshrrev_b32_e32 v70, 16, v70
-; GFX11-NEXT:    s_waitcnt vmcnt(6)
-; GFX11-NEXT:    v_cndmask_b32_e32 v6, v80, v81, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11-NEXT:    v_and_b32_e32 v25, 1, v25
-; GFX11-NEXT:    v_lshrrev_b32_e32 v81, 16, v81
-; GFX11-NEXT:    v_lshrrev_b32_e32 v80, 16, v80
-; GFX11-NEXT:    s_waitcnt vmcnt(4)
-; GFX11-NEXT:    v_cndmask_b32_e32 v4, v82, v83, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11-NEXT:    v_and_b32_e32 v31, 1, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v83, 16, v83
-; GFX11-NEXT:    v_lshrrev_b32_e32 v82, 16, v82
-; GFX11-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v84, v85, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT:    v_and_b32_e32 v29, 1, v29
-; GFX11-NEXT:    v_lshrrev_b32_e32 v85, 16, v85
-; GFX11-NEXT:    v_lshrrev_b32_e32 v84, 16, v84
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v86, v87, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v31
-; GFX11-NEXT:    v_lshrrev_b32_e32 v87, 16, v87
-; GFX11-NEXT:    v_lshrrev_b32_e32 v86, 16, v86
-; GFX11-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v29
-; GFX11-NEXT:    v_cndmask_b32_e32 v29, v34, v35, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v27
-; GFX11-NEXT:    v_cndmask_b32_e32 v27, v36, v37, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v25
-; GFX11-NEXT:    v_cndmask_b32_e32 v25, v38, v39, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v23
-; GFX11-NEXT:    v_cndmask_b32_e32 v23, v48, v49, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v21
-; GFX11-NEXT:    v_cndmask_b32_e32 v21, v50, v51, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v19
-; GFX11-NEXT:    v_cndmask_b32_e32 v19, v52, v53, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v17
-; GFX11-NEXT:    v_cndmask_b32_e32 v17, v54, v55, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX11-NEXT:    v_cndmask_b32_e32 v15, v64, v65, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
-; GFX11-NEXT:    v_cndmask_b32_e32 v13, v66, v67, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
-; GFX11-NEXT:    v_cndmask_b32_e32 v11, v68, v69, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11-NEXT:    v_cndmask_b32_e32 v7, v80, v81, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v84, v85, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v86, v87, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v82, v83, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
-; GFX11-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11-NEXT:    v_cndmask_b32_e32 v9, v70, v71, vcc_lo
-; GFX11-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v10, v21, v20, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v11, v23, v22, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v8, v17, v16, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v9, v19, v18, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v12, v25, v24, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v13, v27, v26, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v14, v29, v28, 0x5040100
-; GFX11-NEXT:    v_perm_b32 v15, v31, v30, 0x5040100
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11TRUE16-LABEL: v_vselect_v32bf16:
+; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    s_clause 0x1f
+; GFX11TRUE16-NEXT:    scratch_load_u16 v31, off, s32
+; GFX11TRUE16-NEXT:    scratch_load_b32 v32, off, s32 offset:128
+; GFX11TRUE16-NEXT:    scratch_load_b32 v33, off, s32 offset:64
+; GFX11TRUE16-NEXT:    scratch_load_b32 v34, off, s32 offset:124
+; GFX11TRUE16-NEXT:    scratch_load_b32 v35, off, s32 offset:60
+; GFX11TRUE16-NEXT:    scratch_load_b32 v36, off, s32 offset:120
+; GFX11TRUE16-NEXT:    scratch_load_b32 v37, off, s32 offset:56
+; GFX11TRUE16-NEXT:    scratch_load_b32 v38, off, s32 offset:116
+; GFX11TRUE16-NEXT:    scratch_load_b32 v39, off, s32 offset:52
+; GFX11TRUE16-NEXT:    scratch_load_b32 v48, off, s32 offset:112
+; GFX11TRUE16-NEXT:    scratch_load_b32 v49, off, s32 offset:48
+; GFX11TRUE16-NEXT:    scratch_load_b32 v50, off, s32 offset:108
+; GFX11TRUE16-NEXT:    scratch_load_b32 v51, off, s32 offset:44
+; GFX11TRUE16-NEXT:    scratch_load_b32 v52, off, s32 offset:104
+; GFX11TRUE16-NEXT:    scratch_load_b32 v53, off, s32 offset:40
+; GFX11TRUE16-NEXT:    scratch_load_b32 v54, off, s32 offset:100
+; GFX11TRUE16-NEXT:    scratch_load_b32 v55, off, s32 offset:36
+; GFX11TRUE16-NEXT:    scratch_load_b32 v64, off, s32 offset:96
+; GFX11TRUE16-NEXT:    scratch_load_b32 v65, off, s32 offset:32
+; GFX11TRUE16-NEXT:    scratch_load_b32 v66, off, s32 offset:92
+; GFX11TRUE16-NEXT:    scratch_load_b32 v67, off, s32 offset:28
+; GFX11TRUE16-NEXT:    scratch_load_b32 v68, off, s32 offset:88
+; GFX11TRUE16-NEXT:    scratch_load_b32 v69, off, s32 offset:24
+; GFX11TRUE16-NEXT:    scratch_load_b32 v70, off, s32 offset:84
+; GFX11TRUE16-NEXT:    scratch_load_b32 v71, off, s32 offset:20
+; GFX11TRUE16-NEXT:    scratch_load_b32 v80, off, s32 offset:80
+; GFX11TRUE16-NEXT:    scratch_load_b32 v81, off, s32 offset:16
+; GFX11TRUE16-NEXT:    scratch_load_b32 v82, off, s32 offset:76
+; GFX11TRUE16-NEXT:    scratch_load_b32 v83, off, s32 offset:12
+; GFX11TRUE16-NEXT:    scratch_load_b32 v84, off, s32 offset:72
+; GFX11TRUE16-NEXT:    scratch_load_b32 v85, off, s32 offset:8
+; GFX11TRUE16-NEXT:    scratch_load_b32 v86, off, s32 offset:68
+; GFX11TRUE16-NEXT:    scratch_load_b32 v87, off, s32 offset:4
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(31)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v96.l, v32.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(30)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v97.l, v33.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(29)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v98.l, v34.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(28)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v99.l, v35.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(27)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v100.l, v36.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(26)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v101.l, v37.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(25)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v102.l, v38.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(24)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v103.l, v39.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(23)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v112.l, v48.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(22)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v113.l, v49.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(21)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v114.l, v50.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v115.l, v51.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(19)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v116.l, v52.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v117.l, v53.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(17)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v118.l, v54.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v119.l, v55.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(15)
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v128.l, v64.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v129.l, v65.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(13)
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v130.l, v66.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v131.l, v67.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(11)
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v132.l, v68.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v133.l, v69.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(9)
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v134.l, v70.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v135.l, v71.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v71
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v70
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v30, 1, v30
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(5)
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v146.l, v82.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v147.l, v83.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v83
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v82
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v28, 1, v28
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v30
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(3)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v30.l, v84.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v84
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v26, 1, v26
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v96, v96, v97, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v28
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v97.l, v85.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v85, 16, v85
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v24, 1, v24
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v98, v98, v99, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v26
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v144.l, v80.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e64 v145.l, v81.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v81
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v26, v100, v101, vcc_lo
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v80
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v22, 1, v22
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v24
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v69
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v68
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v20, 1, v20
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v24, v102, v103, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v22
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v11, 1, v11
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v67
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v66
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v18, 1, v18
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v22, v112, v113, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v20
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v65
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v64
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v16, 1, v16
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v20, v114, v115, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v18
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v15, 1, v15
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v55
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v54
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v18, v116, v117, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v53
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v52
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v16, v118, v119, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v19, 1, v19
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v51
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v50
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v14, v128, v129, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v17, 1, v17
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v49
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v48
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v12, v130, v131, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v23, 1, v23
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v39
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v10, v132, v133, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v21, 1, v21
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v8, v134, v135, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v27, 1, v27
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v6, v144, v145, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v25, 1, v25
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v4, v146, v147, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(1)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v28.l, v86.l
+; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v99.l, v87.l
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v87
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v30, v97, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v29, 1, v29
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v86
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v30.l, v84.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v84.l, v85.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v28, v99, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v31
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v28.l, v86.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v85.l, v87.l
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v29
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v29, v34, v35, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v27
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v27, v36, v37, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v25
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v25, v38, v39, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v23
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v23, v48, v49, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v21
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v21, v50, v51, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v19
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v19, v52, v53, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v17
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v17, v54, v55, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v15, v64, v65, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v13, v66, v67, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v11, v68, v69, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v7, v80, v81, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v30, v84, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v28, v85, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v5, v82, v83, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v9, v70, v71, vcc_lo
+; GFX11TRUE16-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v10, v21, v20, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v11, v23, v22, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v8, v17, v16, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v9, v19, v18, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v12, v25, v24, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v13, v27, v26, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v14, v29, v98, 0x5040100
+; GFX11TRUE16-NEXT:    v_perm_b32 v15, v31, v96, 0x5040100
+; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11FAKE16-LABEL: v_vselect_v32bf16:
+; GFX11FAKE16:       ; %bb.0:
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11FAKE16-NEXT:    s_clause 0x1f
+; GFX11FAKE16-NEXT:    scratch_load_u16 v31, off, s32
+; GFX11FAKE16-NEXT:    scratch_load_b32 v32, off, s32 offset:128
+; GFX11FAKE16-NEXT:    scratch_load_b32 v33, off, s32 offset:64
+; GFX11FAKE16-NEXT:    scratch_load_b32 v34, off, s32 offset:124
+; GFX11FAKE16-NEXT:    scratch_load_b32 v35, off, s32 offset:60
+; GFX11FAKE16-NEXT:    scratch_load_b32 v36, off, s32 offset:120
+; GFX11FAKE16-NEXT:    scratch_load_b32 v37, off, s32 offset:56
+; GFX11FAKE16-NEXT:    scratch_load_b32 v38, off, s32 offset:116
+; GFX11FAKE16-NEXT:    scratch_load_b32 v39, off, s32 offset:52
+; GFX11FAKE16-NEXT:    scratch_load_b32 v48, off, s32 offset:112
+; GFX11FAKE16-NEXT:    scratch_load_b32 v49, off, s32 offset:48
+; GFX11FAKE16-NEXT:    scratch_load_b32 v50, off, s32 offset:108
+; GFX11FAKE16-NEXT:    scratch_load_b32 v51, off, s32 offset:44
+; GFX11FAKE16-NEXT:    scratch_load_b32 v52, off, s32 offset:104
+; GFX11FAKE16-NEXT:    scratch_load_b32 v53, off, s32 offset:40
+; GFX11FAKE16-NEXT:    scratch_load_b32 v54, off, s32 offset:100
+; GFX11FAKE16-NEXT:    scratch_load_b32 v55, off, s32 offset:36
+; GFX11FAKE16-NEXT:    scratch_load_b32 v64, off, s32 offset:96
+; GFX11FAKE16-NEXT:    scratch_load_b32 v65, off, s32 offset:32
+; GFX11FAKE16-NEXT:    scratch_load_b32 v66, off, s32 offset:92
+; GFX11FAKE16-NEXT:    scratch_load_b32 v67, off, s32 offset:28
+; GFX11FAKE16-NEXT:    scratch_load_b32 v68, off, s32 offset:88
+; GFX11FAKE16-NEXT:    scratch_load_b32 v69, off, s32 offset:24
+; GFX11FAKE16-NEXT:    scratch_load_b32 v70, off, s32 offset:84
+; GFX11FAKE16-NEXT:    scratch_load_b32 v71, off, s32 offset:20
+; GFX11FAKE16-NEXT:    scratch_load_b32 v80, off, s32 offset:80
+; GFX11FAKE16-NEXT:    scratch_load_b32 v81, off, s32 offset:16
+; GFX11FAKE16-NEXT:    scratch_load_b32 v82, off, s32 offset:76
+; GFX11FAKE16-NEXT:    scratch_load_b32 v83, off, s32 offset:12
+; GFX11FAKE16-NEXT:    scratch_load_b32 v84, off, s32 offset:72
+; GFX11FAKE16-NEXT:    scratch_load_b32 v85, off, s32 offset:8
+; GFX11FAKE16-NEXT:    scratch_load_b32 v86, off, s32 offset:68
+; GFX11FAKE16-NEXT:    scratch_load_b32 v87, off, s32 offset:4
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v30, 1, v30
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v28, 1, v28
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v26, 1, v26
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v24, 1, v24
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v22, 1, v22
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v30
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v20, 1, v20
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v18, 1, v18
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v16, 1, v16
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(30)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v30, v32, v33, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v28
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v33, 16, v33
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v32, 16, v32
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(28)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v28, v34, v35, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v26
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v7, 1, v7
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v35, 16, v35
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v34, 16, v34
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(26)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v26, v36, v37, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v24
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v37, 16, v37
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v36, 16, v36
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(24)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v24, v38, v39, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v22
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v11, 1, v11
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v39, 16, v39
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v38, 16, v38
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(22)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v22, v48, v49, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v20
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v49, 16, v49
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v48, 16, v48
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(20)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v20, v50, v51, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v18
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v15, 1, v15
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v51, 16, v51
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v50, 16, v50
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(18)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v18, v52, v53, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v16
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v53, 16, v53
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v52, 16, v52
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(16)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v16, v54, v55, vcc_lo
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v55, 16, v55
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v54, 16, v54
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v14
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(14)
+; GFX11FAKE16-NEXT:    v_dual_cndmask_b32 v14, v64, v65 :: v_dual_and_b32 v19, 1, v19
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v12
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v17, 1, v17
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v65, 16, v65
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v64, 16, v64
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(12)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v12, v66, v67, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v10
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v23, 1, v23
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v67, 16, v67
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v66, 16, v66
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(10)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v10, v68, v69, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v8
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v21, 1, v21
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v69, 16, v69
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v68, 16, v68
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(8)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v8, v70, v71, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v6
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v27, 1, v27
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v71, 16, v71
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v70, 16, v70
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(6)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v6, v80, v81, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v4
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v25, 1, v25
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v81, 16, v81
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v80, 16, v80
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(4)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v4, v82, v83, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v31, 1, v31
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v83, 16, v83
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v82, 16, v82
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(2)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v2, v84, v85, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v29, 1, v29
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v85, 16, v85
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v84, 16, v84
+; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v0, v86, v87, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v31
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v87, 16, v87
+; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v86, 16, v86
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v31, v32, v33, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v29
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v29, v34, v35, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v27
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v27, v36, v37, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v25
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v25, v38, v39, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v23
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v23, v48, v49, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v21
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v21, v50, v51, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v19
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v19, v52, v53, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v17
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v17, v54, v55, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v15
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v15, v64, v65, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v13
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v13, v66, v67, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v11
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v11, v68, v69, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v7
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v7, v80, v81, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v3, v84, v85, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v86, v87, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v5
+; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v5, v82, v83, vcc_lo
+; GFX11FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v9
+; GFX11FAKE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v3, v7, v6, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v6, v13, v12, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v2, v5, v4, 0x5040100
+; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v9, v70, v71, vcc_lo
+; GFX11FAKE16-NEXT:    v_perm_b32 v5, v11, v10, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v7, v15, v14, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v10, v21, v20, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v11, v23, v22, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v4, v9, v8, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v8, v17, v16, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v9, v19, v18, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v12, v25, v24, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v13, v27, v26, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v14, v29, v28, 0x5040100
+; GFX11FAKE16-NEXT:    v_perm_b32 v15, v31, v30, 0x5040100
+; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = select <32 x i1> %cond, <32 x bfloat> %a, <32 x bfloat> %b
   ret <32 x bfloat> %op
 }
diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
index 9fe7544003568c..1094b768f1bd16 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -100,9 +100,7 @@ define amdgpu_kernel void @fadd_f16(
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-GISEL-NEXT:    buffer_load_u16 v1, off, s[0:3], 0 glc dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b16_e32 v0.h, v1.l
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_add_f16_e32 v0.l, v0.l, v0.h
+; GFX11-GISEL-NEXT:    v_add_f16_e32 v0.l, v0.l, v1.l
 ; GFX11-GISEL-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 089542d2237baa..5cb45ea9a30716 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -288,15 +288,15 @@
 ; GCN-O1-NEXT:        AMDGPU Rewrite Undef for PHI
 ; GCN-O1-NEXT:        LCSSA Verifier
 ; GCN-O1-NEXT:        Loop-Closed SSA Form Pass
+; GCN-O1-NEXT:      DummyCGSCCPass
+; GCN-O1-NEXT:      FunctionPass Manager
+; GCN-O1-NEXT:        Dominator Tree Construction
 ; GCN-O1-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-NEXT:        Function Alias Analysis Results
 ; GCN-O1-NEXT:        ObjC ARC contraction
-; GCN-O1-NEXT:      DummyCGSCCPass
-; GCN-O1-NEXT:      FunctionPass Manager
 ; GCN-O1-NEXT:        Prepare callbr
 ; GCN-O1-NEXT:        Safe Stack instrumentation pass
 ; GCN-O1-NEXT:        Insert stack protectors
-; GCN-O1-NEXT:        Dominator Tree Construction
 ; GCN-O1-NEXT:        Cycle Info Analysis
 ; GCN-O1-NEXT:        Uniformity Analysis
 ; GCN-O1-NEXT:        Basic Alias Analysis (stateless AA impl)
@@ -589,15 +589,15 @@
 ; GCN-O1-OPTS-NEXT:        AMDGPU Rewrite Undef for PHI
 ; GCN-O1-OPTS-NEXT:        LCSSA Verifier
 ; GCN-O1-OPTS-NEXT:        Loop-Closed SSA Form Pass
+; GCN-O1-OPTS-NEXT:      DummyCGSCCPass
+; GCN-O1-OPTS-NEXT:      FunctionPass Manager
+; GCN-O1-OPTS-NEXT:        Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O1-OPTS-NEXT:        Function Alias Analysis Results
 ; GCN-O1-OPTS-NEXT:        ObjC ARC contraction
-; GCN-O1-OPTS-NEXT:      DummyCGSCCPass
-; GCN-O1-OPTS-NEXT:      FunctionPass Manager
 ; GCN-O1-OPTS-NEXT:        Prepare callbr
 ; GCN-O1-OPTS-NEXT:        Safe Stack instrumentation pass
 ; GCN-O1-OPTS-NEXT:        Insert stack protectors
-; GCN-O1-OPTS-NEXT:        Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:        Cycle Info Analysis
 ; GCN-O1-OPTS-NEXT:        Uniformity Analysis
 ; GCN-O1-OPTS-NEXT:        Basic Alias Analysis (stateless AA impl)
@@ -904,17 +904,15 @@
 ; GCN-O2-NEXT:        LCSSA Verifier
 ; GCN-O2-NEXT:        Loop-Closed SSA Form Pass
 ; GCN-O2-NEXT:      Analysis if a function is memory bound
+; GCN-O2-NEXT:      DummyCGSCCPass
 ; GCN-O2-NEXT:      FunctionPass Manager
 ; GCN-O2-NEXT:        Dominator Tree Construction
 ; GCN-O2-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O2-NEXT:        Function Alias Analysis Results
 ; GCN-O2-NEXT:        ObjC ARC contraction
-; GCN-O2-NEXT:      DummyCGSCCPass
-; GCN-O2-NEXT:      FunctionPass Manager
 ; GCN-O2-NEXT:        Prepare callbr
 ; GCN-O2-NEXT:        Safe Stack instrumentation pass
 ; GCN-O2-NEXT:        Insert stack protectors
-; GCN-O2-NEXT:        Dominator Tree Construction
 ; GCN-O2-NEXT:        Cycle Info Analysis
 ; GCN-O2-NEXT:        Uniformity Analysis
 ; GCN-O2-NEXT:        Basic Alias Analysis (stateless AA impl)
@@ -1234,17 +1232,15 @@
 ; GCN-O3-NEXT:        LCSSA Verifier
 ; GCN-O3-NEXT:        Loop-Closed SSA Form Pass
 ; GCN-O3-NEXT:      Analysis if a function is memory bound
+; GCN-O3-NEXT:      DummyCGSCCPass
 ; GCN-O3-NEXT:      FunctionPass Manager
 ; GCN-O3-NEXT:        Dominator Tree Construction
 ; GCN-O3-NEXT:        Basic Alias Analysis (stateless AA impl)
 ; GCN-O3-NEXT:        Function Alias Analysis Results
 ; GCN-O3-NEXT:        ObjC ARC contraction
-; GCN-O3-NEXT:      DummyCGSCCPass
-; GCN-O3-NEXT:      FunctionPass Manager
 ; GCN-O3-NEXT:        Prepare callbr
 ; GCN-O3-NEXT:        Safe Stack instrumentation pass
 ; GCN-O3-NEXT:        Insert stack protectors
-; GCN-O3-NEXT:        Dominator Tree Construction
 ; GCN-O3-NEXT:        Cycle Info Analysis
 ; GCN-O3-NEXT:        Uniformity Analysis
 ; GCN-O3-NEXT:        Basic Alias Analysis (stateless AA impl)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
index 5cf457d1753b30..4faa482ede59f7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -165,12 +165,10 @@ define amdgpu_kernel void @ceil_v2f16(
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_ceil_f16_e32 v0.l, v0.l
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-NEXT:    v_ceil_f16_e32 v0.h, v1.l
 ; GFX11-NEXT:    v_mov_b16_e32 v1.l, v0.l
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_ceil_f16_e32 v0.h, v0.h
 ; GFX11-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index ece55c7f7dceaa..61f6c9f7f0e6ff 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -166,12 +166,10 @@ define amdgpu_kernel void @floor_v2f16(
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_floor_f16_e32 v0.l, v0.l
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-NEXT:    v_floor_f16_e32 v0.h, v1.l
 ; GFX11-NEXT:    v_mov_b16_e32 v1.l, v0.l
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_floor_f16_e32 v0.h, v0.h
 ; GFX11-NEXT:    v_mov_b16_e32 v0.l, v0.h
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index b2b5153bb6c2a2..72e86f1f6f9992 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -2,12 +2,14 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-SDAG %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG-FAKE16 %s
 
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-GISEL %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL-TRUE16 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL-FAKE16 %s
 
 define float @test_ldexp_f32_i32(ptr addrspace(1) %out, float %a, i32 %b) {
 ; GFX6-LABEL: test_ldexp_f32_i32:
@@ -211,13 +213,22 @@ define half @test_ldexp_f16_i8(half %a, i8 %b) {
 ; GFX9-SDAG-NEXT:    v_ldexp_f16_sdwa v0, v0, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_ldexp_f16_i8:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_ldexp_f16_i8:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_ldexp_f16_i8:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_ldexp_f16_i8:
 ; GFX6-GISEL:       ; %bb.0:
@@ -248,15 +259,25 @@ define half @test_ldexp_f16_i8(half %a, i8 %b) {
 ; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: test_ldexp_f16_i8:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-TRUE16-LABEL: test_ldexp_f16_i8:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_ldexp_f16_i8:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.ldexp.f16.i8(half %a, i8 %b)
   ret half %result
 }
@@ -283,11 +304,19 @@ define half @test_ldexp_f16_i16(half %a, i16 %b) {
 ; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: test_ldexp_f16_i16:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_ldexp_f16_i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_ldexp_f16_i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_ldexp_f16_i16:
 ; GFX6-GISEL:       ; %bb.0:
@@ -297,6 +326,18 @@ define half @test_ldexp_f16_i16(half %a, i16 %b) {
 ; GFX6-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-TRUE16-LABEL: test_ldexp_f16_i16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_ldexp_f16_i16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.ldexp.f16.i16(half %a, i16 %b)
   ret half %result
 }
@@ -328,14 +369,23 @@ define half @test_ldexp_f16_i32(half %a, i32 %b) {
 ; GFX9-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_ldexp_f16_i32:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_ldexp_f16_i32:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_ldexp_f16_i32:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_ldexp_f16_i32:
 ; GFX6-GISEL:       ; %bb.0:
@@ -363,14 +413,23 @@ define half @test_ldexp_f16_i32(half %a, i32 %b) {
 ; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: test_ldexp_f16_i32:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fff
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-TRUE16-LABEL: test_ldexp_f16_i32:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_ldexp_f16_i32:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call half @llvm.ldexp.f16.i32(half %a, i32 %b)
   ret half %result
 }
@@ -411,19 +470,36 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_ldexp_v2f16_v2i32:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-SDAG-NEXT:    v_med3_i32 v2, v2, s0, 0x7fff
-; GFX11-SDAG-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v2, v3, v2
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v2f16_v2i32:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v2, v2, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v3.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i32:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_med3_i32 v2, v2, s0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_med3_i32 v1, v1, s0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v2, v3, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_ldexp_v2f16_v2i32:
 ; GFX6-GISEL:       ; %bb.0:
@@ -460,21 +536,40 @@ define <2 x half> @test_ldexp_v2f16_v2i32(<2 x half> %a, <2 x i32> %b) {
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: test_ldexp_v2f16_v2i32:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
-; GFX11-GISEL-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v3
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v4, v2
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-TRUE16-LABEL: test_ldexp_v2f16_v2i32:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v4.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v2f16_v2i32:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v1, 0xffff8000, v1, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v4, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x half> @llvm.ldexp.v2f16.v2i32(<2 x half> %a, <2 x i32> %b)
   ret <2 x half> %result
 }
@@ -509,16 +604,30 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_ldexp_v2f16_v2i16:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v2, v3, v2
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v2f16_v2i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v3.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v2, v3, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_ldexp_v2f16_v2i16:
 ; GFX6-GISEL:       ; %bb.0:
@@ -549,18 +658,34 @@ define <2 x half> @test_ldexp_v2f16_v2i16(<2 x half> %a, <2 x i16> %b) {
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: test_ldexp_v2f16_v2i16:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v2, v3
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-TRUE16-LABEL: test_ldexp_v2f16_v2i16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v2.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v2f16_v2i16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v2, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x half> @llvm.ldexp.v2f16.v2i16(<2 x half> %a, <2 x i16> %b)
   ret <2 x half> %result
 }
@@ -608,21 +733,40 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v3
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_ldexp_v3f16_v3i32:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX11-SDAG-NEXT:    v_med3_i32 v3, v3, s0, 0x7fff
-; GFX11-SDAG-NEXT:    v_med3_i32 v2, v2, s0, 0x7fff
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v3, v5, v3
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    v_med3_i32 v2, v4, s0, 0x7fff
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v1, v1, v2
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v3f16_v3i32:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v3, v3, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v2, v2, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v5.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v2, v4, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v1.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v3, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i32:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_med3_i32 v3, v3, s0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_med3_i32 v2, v2, s0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v3, v5, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_med3_i32 v2, v4, s0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v1, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_ldexp_v3f16_v3i32:
 ; GFX6-GISEL:       ; %bb.0:
@@ -666,23 +810,44 @@ define <3 x half> @test_ldexp_v3f16_v3i32(<3 x half> %a, <3 x i32> %b) {
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: test_ldexp_v3f16_v3i32:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7fff
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v5
-; GFX11-GISEL-NEXT:    v_med3_i32 v3, 0xffff8000, v3, v5
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v2, v6, v3
-; GFX11-GISEL-NEXT:    v_med3_i32 v3, 0xffff8000, v4, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v1, v3
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-TRUE16-LABEL: test_ldexp_v3f16_v3i32:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v5, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v5
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v3, 0xffff8000, v3, v5
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v6.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v3, 0xffff8000, v4, v5
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v1.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v3f16_v3i32:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v5, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v5
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v3, 0xffff8000, v3, v5
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v2, v6, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v3, 0xffff8000, v4, v5
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v1, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x half> @llvm.ldexp.v3f16.v3i32(<3 x half> %a, <3 x i32> %b)
   ret <3 x half> %result
 }
@@ -723,17 +888,32 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v4
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_ldexp_v3f16_v3i16:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v1, v1, v3
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v2, v5, v4
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v3f16_v3i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v5.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v2, v5, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_ldexp_v3f16_v3i16:
 ; GFX6-GISEL:       ; %bb.0:
@@ -770,19 +950,36 @@ define <3 x half> @test_ldexp_v3f16_v3i16(<3 x half> %a, <3 x i16> %b) {
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v4
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: test_ldexp_v3f16_v3i16:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v1, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v2, v4, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-TRUE16-LABEL: test_ldexp_v3f16_v3i16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v1.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v4.l, v5.l
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v3f16_v3i16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v1, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v2, v4, v5
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <3 x half> @llvm.ldexp.v3f16.v3i16(<3 x half> %a, <3 x i16> %b)
   ret <3 x half> %result
 }
@@ -839,26 +1036,53 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v5
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_ldexp_v4f16_v4i32:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    s_movk_i32 s0, 0x8000
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX11-SDAG-NEXT:    v_med3_i32 v5, v5, s0, 0x7fff
-; GFX11-SDAG-NEXT:    v_med3_i32 v3, v3, s0, 0x7fff
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX11-SDAG-NEXT:    v_med3_i32 v2, v2, s0, 0x7fff
-; GFX11-SDAG-NEXT:    v_med3_i32 v4, v4, s0, 0x7fff
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v5, v6, v5
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v3, v7, v3
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v1, v1, v4
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v3
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v5
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v4f16_v4i32:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v5, v5, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v3, v3, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v2, v2, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_med3_i32 v4, v4, s0, 0x7fff
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v6.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v1.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v1.h, v7.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i32:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    s_movk_i32 s0, 0x8000
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_med3_i32 v5, v5, s0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_med3_i32 v3, v3, s0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_med3_i32 v2, v2, s0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_med3_i32 v4, v4, s0, 0x7fff
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v5, v6, v5
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v3, v7, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v1, v4
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v5
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_ldexp_v4f16_v4i32:
 ; GFX6-GISEL:       ; %bb.0:
@@ -911,30 +1135,61 @@ define <4 x half> @test_ldexp_v4f16_v4i32(<4 x half> %a, <4 x i32> %b) {
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: test_ldexp_v4f16_v4i32:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7fff
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v6
-; GFX11-GISEL-NEXT:    v_med3_i32 v4, 0xffff8000, v4, v6
-; GFX11-GISEL-NEXT:    v_med3_i32 v3, 0xffff8000, v3, v6
-; GFX11-GISEL-NEXT:    v_med3_i32 v5, 0xffff8000, v5, v6
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v1, v4
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v2, v7, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v3, v8, v5
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-TRUE16-LABEL: test_ldexp_v4f16_v4i32:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v6, 0x7fff
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v6
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v4, 0xffff8000, v4, v6
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v3, 0xffff8000, v3, v6
+; GFX11-GISEL-TRUE16-NEXT:    v_med3_i32 v5, 0xffff8000, v5, v6
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v1.l, v4.l
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v7.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v1.h, v8.l, v5.l
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v4f16_v4i32:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v6, 0x7fff
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v2, 0xffff8000, v2, v6
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v4, 0xffff8000, v4, v6
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v3, 0xffff8000, v3, v6
+; GFX11-GISEL-FAKE16-NEXT:    v_med3_i32 v5, 0xffff8000, v5, v6
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v1, v4
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v2, v7, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v3, v8, v5
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x half> @llvm.ldexp.v4f16.v4i32(<4 x half> %a, <4 x i32> %b)
   ret <4 x half> %result
 }
@@ -983,22 +1238,45 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
 ; GFX9-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v4
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-SDAG-LABEL: test_ldexp_v4f16_v4i16:
-; GFX11-SDAG:       ; %bb.0:
-; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GFX11-SDAG-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v2, v6, v5
-; GFX11-SDAG-NEXT:    v_ldexp_f16_e32 v3, v7, v4
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v0, v0, v2
-; GFX11-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v3
-; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v4f16_v4i16:
+; GFX11-SDAG-TRUE16:       ; %bb.0:
+; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v2.l
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v1.l, v3.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v6.l, v5.l
+; GFX11-SDAG-TRUE16-NEXT:    v_ldexp_f16_e32 v1.h, v7.l, v4.l
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v2, v0
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v1, v3, v1
+; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i16:
+; GFX11-SDAG-FAKE16:       ; %bb.0:
+; GFX11-SDAG-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
+; GFX11-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v2, v6, v5
+; GFX11-SDAG-FAKE16-NEXT:    v_ldexp_f16_e32 v3, v7, v4
+; GFX11-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v2
+; GFX11-SDAG-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX11-SDAG-FAKE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-GISEL-LABEL: test_ldexp_v4f16_v4i16:
 ; GFX6-GISEL:       ; %bb.0:
@@ -1043,25 +1321,51 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-GISEL-LABEL: test_ldexp_v4f16_v4i16:
-; GFX11-GISEL:       ; %bb.0:
-; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v2
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v1, v1, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v2, v4, v6
-; GFX11-GISEL-NEXT:    v_ldexp_f16_e32 v3, v5, v7
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
-; GFX11-GISEL-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
-; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
+; GFX11-GISEL-TRUE16-LABEL: test_ldexp_v4f16_v4i16:
+; GFX11-GISEL-TRUE16:       ; %bb.0:
+; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.l, v0.l, v2.l
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v0.h, v1.l, v3.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v1.l, v4.l, v6.l
+; GFX11-GISEL-TRUE16-NEXT:    v_ldexp_f16_e32 v1.h, v5.l, v7.l
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v0.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
+; GFX11-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_and_b32_e32 v4, 0xffff, v0
+; GFX11-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v0, v3, 16, v2
+; GFX11-GISEL-TRUE16-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
+; GFX11-GISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v4f16_v4i16:
+; GFX11-GISEL-FAKE16:       ; %bb.0:
+; GFX11-GISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v0, v0, v2
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v1, v1, v3
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v2, v4, v6
+; GFX11-GISEL-FAKE16-NEXT:    v_ldexp_f16_e32 v3, v5, v7
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX11-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
+; GFX11-GISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %result = call <4 x half> @llvm.ldexp.v4f16.v4i16(<4 x half> %a, <4 x i16> %b)
   ret <4 x half> %result
 }
diff --git a/llvm/test/CodeGen/Hexagon/mask-instr.ll b/llvm/test/CodeGen/Hexagon/mask-instr.ll
new file mode 100644
index 00000000000000..ab60a6fcade739
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/mask-instr.ll
@@ -0,0 +1,26 @@
+; Enable the Utilize mask instruction pass only on v66 and above.
+; RUN: llc -mv60 -march=hexagon < %s -o /dev/null
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+@b = dso_local local_unnamed_addr global i8 0, align 1
+@a = dso_local local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: cold nounwind optsize memory(readwrite, argmem: none, inaccessiblemem: none)
+define dso_local void @c() local_unnamed_addr {
+entry:
+  %0 = tail call i32 asm "", "=&r"()
+  %and = and i32 %0, 134217727
+  %tobool.not = icmp eq i32 %and, 0
+  br i1 %tobool.not, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %1 = load i8, ptr @b, align 1
+  %loadedv = zext nneg i8 %1 to i32
+  store i32 %loadedv, ptr @a, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/memcpy-crash-zvl32b.ll b/llvm/test/CodeGen/RISCV/rvv/memcpy-crash-zvl32b.ll
new file mode 100644
index 00000000000000..e020fe1a0aa1ac
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/memcpy-crash-zvl32b.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv64 -mattr=+zve32x | FileCheck %s
+
+; Make sure we don't crash with VLEN=32.
+
+define void @c() {
+; CHECK-LABEL: c:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lw a0, 0(zero)
+; CHECK-NEXT:    sw a0, 0(zero)
+; CHECK-NEXT:    ret
+entry:
+  call void @llvm.memcpy.p0.p0.i64(ptr null, ptr null, i64 4, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
diff --git a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll
index 06b31657e0eca0..a4d58985b75de5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vwadd-sdnode.ll
@@ -1501,18 +1501,23 @@ define <vscale x 8 x i32> @vwadd_vx_splat_zext_i1(<vscale x 8 x i1> %va, i16 %b)
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a0, a0, 16
 ; RV32-NEXT:    srli a0, a0, 16
-; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, mu
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vadd.vi v8, v8, 1, v0.t
+; RV32-NEXT:    addi a0, a0, 1
+; RV32-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vwadd_vx_splat_zext_i1:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    slli a0, a0, 48
 ; RV64-NEXT:    srli a0, a0, 48
-; RV64-NEXT:    vsetvli a1, zero, e32, m4, ta, mu
+; RV64-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v12, a0
+; RV64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a0
-; RV64-NEXT:    vadd.vi v8, v8, 1, v0.t
+; RV64-NEXT:    li a0, 1
+; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
+; RV64-NEXT:    vwaddu.vx v8, v12, a0, v0.t
 ; RV64-NEXT:    ret
   %zb = zext i16 %b to i32
   %head = insertelement <vscale x 8 x i32> poison, i32 %zb, i32 0
@@ -1570,20 +1575,23 @@ define <vscale x 8 x i32> @vwadd_vx_splat_sext_i1(<vscale x 8 x i1> %va, i16 %b)
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    slli a0, a0, 16
 ; RV32-NEXT:    srai a0, a0, 16
-; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, mu
+; RV32-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    li a0, 1
-; RV32-NEXT:    vsub.vx v8, v8, a0, v0.t
+; RV32-NEXT:    addi a0, a0, -1
+; RV32-NEXT:    vmerge.vxm v8, v8, a0, v0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vwadd_vx_splat_sext_i1:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    slli a0, a0, 48
 ; RV64-NEXT:    srai a0, a0, 48
-; RV64-NEXT:    vsetvli a1, zero, e32, m4, ta, mu
+; RV64-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; RV64-NEXT:    vmv.v.x v12, a0
+; RV64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a0
 ; RV64-NEXT:    li a0, 1
-; RV64-NEXT:    vsub.vx v8, v8, a0, v0.t
+; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, mu
+; RV64-NEXT:    vwsub.vx v8, v12, a0, v0.t
 ; RV64-NEXT:    ret
   %sb = sext i16 %b to i32
   %head = insertelement <vscale x 8 x i32> poison, i32 %sb, i32 0
diff --git a/llvm/test/CodeGen/X86/combine-concatvectors.ll b/llvm/test/CodeGen/X86/combine-concatvectors.ll
index 17d22607cfea88..230afd1461935d 100644
--- a/llvm/test/CodeGen/X86/combine-concatvectors.ll
+++ b/llvm/test/CodeGen/X86/combine-concatvectors.ll
@@ -118,3 +118,26 @@ define <4 x float> @concat_of_broadcast_v4f32_v8f32(ptr %a0, ptr %a1, ptr %a2) {
   %shuffle1 = shufflevector <8 x float> %ld2, <8 x float> %shuffle, <4 x i32> <i32 6, i32 15, i32 12, i32 3>
   ret <4 x float> %shuffle1
 }
+
+define <4 x i64> @broadcast_of_shuffle_v2i64_v4i64(<16 x i8> %vecinit.i) {
+; AVX1-LABEL: broadcast_of_shuffle_v2i64_v4i64:
+; AVX1:       # %bb.0: # %entry
+; AVX1-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: broadcast_of_shuffle_v2i64_v4i64:
+; AVX2:       # %bb.0: # %entry
+; AVX2-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm0
+; AVX2-NEXT:    retq
+entry:
+  %vecinit15.i = shufflevector <16 x i8> %vecinit.i, <16 x i8> poison, <16 x i32> zeroinitializer
+  %0 = bitcast <16 x i8> %vecinit15.i to <2 x i64>
+  %1 = extractelement <2 x i64> %0, i64 0
+  %2 = and i64 %1, -72057594037927936 ; 0xFF00 0000 0000 0000
+  %3 = insertelement <4 x i64> poison, i64 %2, i64 0
+  %4 = shufflevector <4 x i64> %3, <4 x i64> poison, <4 x i32> zeroinitializer
+  ret <4 x i64> %4
+}
diff --git a/llvm/test/CodeGen/X86/cttz.ll b/llvm/test/CodeGen/X86/cttz.ll
index 6eb748a1afbab5..b35a1b72fcb6f1 100644
--- a/llvm/test/CodeGen/X86/cttz.ll
+++ b/llvm/test/CodeGen/X86/cttz.ll
@@ -317,13 +317,11 @@ define i32 @cttz_i32_zero_test(i32 %n) {
 ;
 ; X64-LABEL: cttz_i32_zero_test:
 ; X64:       # %bb.0:
-; X64-NEXT:    testl %edi, %edi
-; X64-NEXT:    je .LBB6_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %edi, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB6_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
+; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    rep bsfq %rax, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
 ;
 ; X86-CLZ-LABEL: cttz_i32_zero_test:
diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
index df11a44626e381..d5d604a138a719 100644
--- a/llvm/test/CodeGen/X86/known-never-zero.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero.ll
@@ -54,13 +54,12 @@ define i32 @or_maybe_zero(i32 %x, i32 %y) {
 ;
 ; X64-LABEL: or_maybe_zero:
 ; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    orl %esi, %edi
-; X64-NEXT:    je .LBB1_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %edi, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB1_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
+; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    rep bsfq %rax, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = or i32 %x, %y
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -115,13 +114,10 @@ define i32 @select_maybe_zero(i1 %c, i32 %x) {
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    testb $1, %dil
 ; X64-NEXT:    cmovnel %esi, %eax
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    je .LBB3_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB3_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    rep bsfq %rcx, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %y = or i32 %x, 1
   %z = select i1 %c, i32 %y, i32 0
@@ -216,16 +212,14 @@ define i32 @shl_maybe_zero(i32 %x, i32 %y) {
 ;
 ; X64-LABEL: shl_maybe_zero:
 ; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    movl %edi, %ecx
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shll %cl, %esi
-; X64-NEXT:    testl %esi, %esi
-; X64-NEXT:    je .LBB7_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %esi, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB7_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
+; X64-NEXT:    orq %rsi, %rax
+; X64-NEXT:    rep bsfq %rax, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = shl nuw nsw i32 %y, %x
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -275,13 +269,10 @@ define i32 @uaddsat_maybe_zero(i32 %x, i32 %y) {
 ; X64-NEXT:    addl %esi, %edi
 ; X64-NEXT:    movl $-1, %eax
 ; X64-NEXT:    cmovael %edi, %eax
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    je .LBB9_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB9_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    rep bsfq %rcx, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = call i32 @llvm.uadd.sat.i32(i32 %x, i32 %y)
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -334,15 +325,13 @@ define i32 @umax_maybe_zero(i32 %x, i32 %y) {
 ;
 ; X64-LABEL: umax_maybe_zero:
 ; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    cmpl %esi, %edi
 ; X64-NEXT:    cmoval %edi, %esi
-; X64-NEXT:    testl %esi, %esi
-; X64-NEXT:    je .LBB11_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %esi, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB11_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
+; X64-NEXT:    orq %rsi, %rax
+; X64-NEXT:    rep bsfq %rax, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = call i32 @llvm.umax.i32(i32 %x, i32 %y)
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -401,13 +390,10 @@ define i32 @umin_maybe_zero(i32 %x, i32 %y) {
 ; X64-NEXT:    cmpl $54, %edi
 ; X64-NEXT:    movl $54, %eax
 ; X64-NEXT:    cmovbl %edi, %eax
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    je .LBB13_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB13_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    rep bsfq %rcx, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = call i32 @llvm.umin.i32(i32 %x, i32 54)
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -522,13 +508,10 @@ define i32 @smin_maybe_zero(i32 %x, i32 %y) {
 ; X64-NEXT:    cmpl $54, %edi
 ; X64-NEXT:    movl $54, %eax
 ; X64-NEXT:    cmovll %edi, %eax
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    je .LBB17_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB17_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    rep bsfq %rcx, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = call i32 @llvm.smin.i32(i32 %x, i32 54)
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -643,13 +626,10 @@ define i32 @smax_known_zero(i32 %x, i32 %y) {
 ; X64-NEXT:    testl %edi, %edi
 ; X64-NEXT:    movl $-1, %eax
 ; X64-NEXT:    cmovnsl %edi, %eax
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    je .LBB21_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB21_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    rep bsfq %rcx, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = call i32 @llvm.smax.i32(i32 %x, i32 -1)
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -676,16 +656,9 @@ define i32 @rotr_known_nonzero(i32 %xx, i32 %y) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
 ; X64-NEXT:    orl $256, %edi # imm = 0x100
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    rorl %cl, %eax
-; X64-NEXT:    testl %edi, %edi
-; X64-NEXT:    je .LBB22_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB22_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    rorl %cl, %edi
+; X64-NEXT:    rep bsfl %edi, %eax
 ; X64-NEXT:    retq
   %x = or i32 %xx, 256
   %shr = lshr i32 %x, %y
@@ -714,16 +687,13 @@ define i32 @rotr_maybe_zero(i32 %x, i32 %y) {
 ; X64-LABEL: rotr_maybe_zero:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    rorl %cl, %eax
-; X64-NEXT:    testl %edi, %edi
-; X64-NEXT:    je .LBB23_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB23_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    rorl %cl, %edi
+; X64-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
+; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    rep bsfq %rax, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %shr = lshr i32 %x, %y
   %sub = sub i32 32, %y
@@ -775,16 +745,13 @@ define i32 @rotr_with_fshr_maybe_zero(i32 %x, i32 %y) {
 ; X64-LABEL: rotr_with_fshr_maybe_zero:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    rorl %cl, %eax
-; X64-NEXT:    testl %edi, %edi
-; X64-NEXT:    je .LBB25_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB25_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    rorl %cl, %edi
+; X64-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
+; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    rep bsfq %rax, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 %y)
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -811,16 +778,9 @@ define i32 @rotl_known_nonzero(i32 %xx, i32 %y) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
 ; X64-NEXT:    orl $256, %edi # imm = 0x100
-; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    roll %cl, %eax
-; X64-NEXT:    testl %edi, %edi
-; X64-NEXT:    je .LBB26_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB26_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    roll %cl, %edi
+; X64-NEXT:    rep bsfl %edi, %eax
 ; X64-NEXT:    retq
   %x = or i32 %xx, 256
   %shl = shl i32 %x, %y
@@ -849,16 +809,13 @@ define i32 @rotl_maybe_zero(i32 %x, i32 %y) {
 ; X64-LABEL: rotl_maybe_zero:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    roll %cl, %eax
-; X64-NEXT:    testl %edi, %edi
-; X64-NEXT:    je .LBB27_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB27_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    roll %cl, %edi
+; X64-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
+; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    rep bsfq %rax, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %shl = shl i32 %x, %y
   %sub = sub i32 32, %y
@@ -910,16 +867,13 @@ define i32 @rotl_with_fshl_maybe_zero(i32 %x, i32 %y) {
 ; X64-LABEL: rotl_with_fshl_maybe_zero:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    roll %cl, %eax
-; X64-NEXT:    testl %edi, %edi
-; X64-NEXT:    je .LBB29_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB29_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    roll %cl, %edi
+; X64-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
+; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    rep bsfq %rax, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %y)
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -989,16 +943,14 @@ define i32 @sra_maybe_zero(i32 %x, i32 %y) {
 ;
 ; X64-LABEL: sra_maybe_zero:
 ; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    movl %edi, %ecx
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    sarl %cl, %esi
-; X64-NEXT:    testl %esi, %esi
-; X64-NEXT:    je .LBB32_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %esi, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB32_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
+; X64-NEXT:    orq %rsi, %rax
+; X64-NEXT:    rep bsfq %rax, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = ashr exact i32 %y, %x
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -1068,16 +1020,14 @@ define i32 @srl_maybe_zero(i32 %x, i32 %y) {
 ;
 ; X64-LABEL: srl_maybe_zero:
 ; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    movl %edi, %ecx
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrl %cl, %esi
-; X64-NEXT:    testl %esi, %esi
-; X64-NEXT:    je .LBB35_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %esi, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB35_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
+; X64-NEXT:    orq %rsi, %rax
+; X64-NEXT:    rep bsfq %rax, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = lshr exact i32 %y, %x
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -1128,13 +1078,11 @@ define i32 @udiv_maybe_zero(i32 %x, i32 %y) {
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    divl %esi
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    je .LBB37_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB37_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    # kill: def $eax killed $eax def $rax
+; X64-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    rep bsfq %rcx, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = udiv exact i32 %x, %y
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -1185,13 +1133,11 @@ define i32 @sdiv_maybe_zero(i32 %x, i32 %y) {
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    cltd
 ; X64-NEXT:    idivl %esi
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    je .LBB39_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB39_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    # kill: def $eax killed $eax def $rax
+; X64-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    rep bsfq %rcx, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = sdiv exact i32 %x, %y
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -1235,14 +1181,13 @@ define i32 @add_maybe_zero(i32 %xx, i32 %y) {
 ;
 ; X64-LABEL: add_maybe_zero:
 ; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    orl $1, %edi
 ; X64-NEXT:    addl %esi, %edi
-; X64-NEXT:    je .LBB41_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %edi, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB41_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
+; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    rep bsfq %rax, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %x = or i32 %xx, 1
   %z = add nsw i32 %x, %y
@@ -1321,12 +1266,10 @@ define i32 @sub_maybe_zero(i32 %x) {
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    orl $64, %eax
 ; X64-NEXT:    subl %edi, %eax
-; X64-NEXT:    je .LBB44_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB44_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    rep bsfq %rcx, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %y = or i32 %x, 64
   %z = sub i32 %y, %x
@@ -1349,13 +1292,12 @@ define i32 @sub_maybe_zero2(i32 %x) {
 ;
 ; X64-LABEL: sub_maybe_zero2:
 ; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    negl %edi
-; X64-NEXT:    je .LBB45_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %edi, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB45_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
+; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    rep bsfq %rax, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = sub i32 0, %x
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -1379,15 +1321,13 @@ define i32 @mul_known_nonzero_nsw(i32 %x, i32 %yy) {
 ;
 ; X64-LABEL: mul_known_nonzero_nsw:
 ; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    orl $256, %esi # imm = 0x100
 ; X64-NEXT:    imull %edi, %esi
-; X64-NEXT:    testl %esi, %esi
-; X64-NEXT:    je .LBB46_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %esi, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB46_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
+; X64-NEXT:    orq %rsi, %rax
+; X64-NEXT:    rep bsfq %rax, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %y = or i32 %yy, 256
   %z = mul nsw i32 %y, %x
@@ -1412,15 +1352,13 @@ define i32 @mul_known_nonzero_nuw(i32 %x, i32 %yy) {
 ;
 ; X64-LABEL: mul_known_nonzero_nuw:
 ; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    orl $256, %esi # imm = 0x100
 ; X64-NEXT:    imull %edi, %esi
-; X64-NEXT:    testl %esi, %esi
-; X64-NEXT:    je .LBB47_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %esi, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB47_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
+; X64-NEXT:    orq %rsi, %rax
+; X64-NEXT:    rep bsfq %rax, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %y = or i32 %yy, 256
   %z = mul nuw i32 %y, %x
@@ -1444,14 +1382,12 @@ define i32 @mul_maybe_zero(i32 %x, i32 %y) {
 ;
 ; X64-LABEL: mul_maybe_zero:
 ; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    imull %esi, %edi
-; X64-NEXT:    testl %edi, %edi
-; X64-NEXT:    je .LBB48_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %edi, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB48_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rax # imm = 0x100000000
+; X64-NEXT:    orq %rdi, %rax
+; X64-NEXT:    rep bsfq %rax, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = mul nuw nsw i32 %y, %x
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -1482,9 +1418,10 @@ define i32 @bitcast_known_nonzero(<2 x i16> %xx) {
 ; X64-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; X64-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,256,u,u,u,u,u,u]
 ; X64-NEXT:    vmovd %xmm0, %eax
-; X64-NEXT:    bsfl %eax, %ecx
-; X64-NEXT:    movl $32, %eax
-; X64-NEXT:    cmovnel %ecx, %eax
+; X64-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    rep bsfq %rcx, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %x = shl nuw nsw <2 x i16> <i16 256, i16 256>, %xx
   %z = bitcast <2 x i16> %x to i32
@@ -1508,13 +1445,10 @@ define i32 @bitcast_maybe_zero(<2 x i16> %x) {
 ; X64-LABEL: bitcast_maybe_zero:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovd %xmm0, %eax
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    je .LBB50_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB50_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    rep bsfq %rcx, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = bitcast <2 x i16> %x to i32
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -1538,13 +1472,10 @@ define i32 @bitcast_from_float(float %x) {
 ; X64-LABEL: bitcast_from_float:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovd %xmm0, %eax
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    je .LBB51_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB51_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    rep bsfq %rcx, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = bitcast float %x to i32
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -1592,14 +1523,11 @@ define i32 @zext_maybe_zero(i16 %x) {
 ;
 ; X64-LABEL: zext_maybe_zero:
 ; X64:       # %bb.0:
-; X64-NEXT:    testw %di, %di
-; X64-NEXT:    je .LBB53_1
-; X64-NEXT:  # %bb.2: # %cond.false
 ; X64-NEXT:    movzwl %di, %eax
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB53_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    rep bsfq %rcx, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = zext i16 %x to i32
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
@@ -1646,14 +1574,11 @@ define i32 @sext_maybe_zero(i16 %x) {
 ;
 ; X64-LABEL: sext_maybe_zero:
 ; X64:       # %bb.0:
-; X64-NEXT:    testw %di, %di
-; X64-NEXT:    je .LBB55_1
-; X64-NEXT:  # %bb.2: # %cond.false
 ; X64-NEXT:    movswl %di, %eax
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB55_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    rep bsfq %rcx, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %z = sext i16 %x to i32
   %r = call i32 @llvm.cttz.i32(i32 %z, i1 false)
diff --git a/llvm/test/CodeGen/X86/pr89877.ll b/llvm/test/CodeGen/X86/pr89877.ll
index 9820ec42f5b8cc..fdbe75b467d992 100644
--- a/llvm/test/CodeGen/X86/pr89877.ll
+++ b/llvm/test/CodeGen/X86/pr89877.ll
@@ -24,14 +24,11 @@ define i32 @sext_known_nonzero(i16 %xx) {
 ; X64-NEXT:    movl $256, %eax # imm = 0x100
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shll %cl, %eax
-; X64-NEXT:    cwtl
-; X64-NEXT:    testl %eax, %eax
-; X64-NEXT:    je .LBB0_1
-; X64-NEXT:  # %bb.2: # %cond.false
-; X64-NEXT:    rep bsfl %eax, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB0_1:
-; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    movswq %ax, %rax
+; X64-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    rep bsfq %rcx, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %x = shl i16 256, %xx
   %z = sext i16 %x to i32
diff --git a/llvm/test/CodeGen/X86/pr90847.ll b/llvm/test/CodeGen/X86/pr90847.ll
index 7aa0ceb26e1acb..f2d43c3ed8d5bd 100644
--- a/llvm/test/CodeGen/X86/pr90847.ll
+++ b/llvm/test/CodeGen/X86/pr90847.ll
@@ -15,14 +15,10 @@ define i32 @PR90847(<8 x float> %x) nounwind {
 ; AVX1-NEXT:    vminps %ymm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vcmpeqps %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    vmovmskps %ymm0, %eax
-; AVX1-NEXT:    testl %eax, %eax
-; AVX1-NEXT:    je .LBB0_1
-; AVX1-NEXT:  # %bb.2: # %cond.false
-; AVX1-NEXT:    rep bsfl %eax, %eax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-; AVX1-NEXT:  .LBB0_1:
-; AVX1-NEXT:    movl $32, %eax
+; AVX1-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
+; AVX1-NEXT:    orq %rax, %rcx
+; AVX1-NEXT:    rep bsfq %rcx, %rax
+; AVX1-NEXT:    # kill: def $eax killed $eax killed $rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -36,14 +32,10 @@ define i32 @PR90847(<8 x float> %x) nounwind {
 ; AVX2-NEXT:    vminps %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vcmpeqps %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    vmovmskps %ymm0, %eax
-; AVX2-NEXT:    testl %eax, %eax
-; AVX2-NEXT:    je .LBB0_1
-; AVX2-NEXT:  # %bb.2: # %cond.false
-; AVX2-NEXT:    rep bsfl %eax, %eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-; AVX2-NEXT:  .LBB0_1:
-; AVX2-NEXT:    movl $32, %eax
+; AVX2-NEXT:    movabsq $4294967296, %rcx # imm = 0x100000000
+; AVX2-NEXT:    orq %rax, %rcx
+; AVX2-NEXT:    rep bsfq %rcx, %rax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 entry:
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2.txt
index 84ced2fd224997..26ffd3a4e383b2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop2.txt
@@ -783,49 +783,64 @@
 # GFX11: v_fmamk_f32 v255, 0xaf123456, 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x59,0x56,0x34,0x12,0xaf]
 0xff,0xfe,0xff,0x59,0x56,0x34,0x12,0xaf
 
-# GFX11: v_ldexp_f16_e32 v5, v1, v2              ; encoding: [0x01,0x05,0x0a,0x76]
+# GFX11-FAKE16: v_ldexp_f16_e32 v5, v1, v2              ; encoding: [0x01,0x05,0x0a,0x76]
+# GFX11-REAL16: v_ldexp_f16_e32 v5.l, v1.l, v2.l        ; encoding: [0x01,0x05,0x0a,0x76]
 0x01,0x05,0x0a,0x76
 
-# GFX11: v_ldexp_f16_e32 v5, v127, v2            ; encoding: [0x7f,0x05,0x0a,0x76]
+# GFX11-FAKE16: v_ldexp_f16_e32 v5, v127, v2            ; encoding: [0x7f,0x05,0x0a,0x76]
+# GFX11-REAL16:	v_ldexp_f16_e32 v5.l, v127.l, v2.l      ; encoding: [0x7f,0x05,0x0a,0x76]
 0x7f,0x05,0x0a,0x76
 
-# GFX11: v_ldexp_f16_e32 v5, s1, v2              ; encoding: [0x01,0x04,0x0a,0x76]
+# GFX11-FAKE16: v_ldexp_f16_e32 v5, s1, v2              ; encoding: [0x01,0x04,0x0a,0x76]
+# GFX11-REAL16:	v_ldexp_f16_e32 v5.l, s1, v2.l          ; encoding: [0x01,0x04,0x0a,0x76]
 0x01,0x04,0x0a,0x76
 
-# GFX11: v_ldexp_f16_e32 v5, s105, v2            ; encoding: [0x69,0x04,0x0a,0x76]
+# GFX11-FAKE16: v_ldexp_f16_e32 v5, s105, v2            ; encoding: [0x69,0x04,0x0a,0x76]
+# GFX11-REAL16:	v_ldexp_f16_e32 v5.l, s105, v2.l        ; encoding: [0x69,0x04,0x0a,0x76]
 0x69,0x04,0x0a,0x76
 
-# GFX11: v_ldexp_f16_e32 v5, vcc_lo, v2          ; encoding: [0x6a,0x04,0x0a,0x76]
+# GFX11-FAKE16: v_ldexp_f16_e32 v5, vcc_lo, v2          ; encoding: [0x6a,0x04,0x0a,0x76]
+# GFX11-REAL16:	v_ldexp_f16_e32 v5.l, vcc_lo, v2.l      ; encoding: [0x6a,0x04,0x0a,0x76]
 0x6a,0x04,0x0a,0x76
 
-# GFX11: v_ldexp_f16_e32 v5, vcc_hi, v2          ; encoding: [0x6b,0x04,0x0a,0x76]
+# GFX11-FAKE16: v_ldexp_f16_e32 v5, vcc_hi, v2          ; encoding: [0x6b,0x04,0x0a,0x76]
+# GFX11-REAL16:	v_ldexp_f16_e32 v5.l, vcc_hi, v2.l      ; encoding: [0x6b,0x04,0x0a,0x76]
 0x6b,0x04,0x0a,0x76
 
-# GFX11: v_ldexp_f16_e32 v5, ttmp15, v2          ; encoding: [0x7b,0x04,0x0a,0x76]
+# GFX11-FAKE16: v_ldexp_f16_e32 v5, ttmp15, v2          ; encoding: [0x7b,0x04,0x0a,0x76]
+# GFX11-REAL16:	v_ldexp_f16_e32 v5.l, ttmp15, v2.l      ; encoding: [0x7b,0x04,0x0a,0x76]
 0x7b,0x04,0x0a,0x76
 
-# GFX11: v_ldexp_f16_e32 v5, m0, v2              ; encoding: [0x7d,0x04,0x0a,0x76]
+# GFX11-FAKE16: v_ldexp_f16_e32 v5, m0, v2              ; encoding: [0x7d,0x04,0x0a,0x76]
+# GFX11-REAL16:	v_ldexp_f16_e32 v5.l, m0, v2.l          ; encoding: [0x7d,0x04,0x0a,0x76]
 0x7d,0x04,0x0a,0x76
 
-# GFX11: v_ldexp_f16_e32 v5, exec_lo, v2         ; encoding: [0x7e,0x04,0x0a,0x76]
+# GFX11-FAKE16: v_ldexp_f16_e32 v5, exec_lo, v2         ; encoding: [0x7e,0x04,0x0a,0x76]
+# GFX11-REAL16:	v_ldexp_f16_e32 v5.l, exec_lo, v2.l     ; encoding: [0x7e,0x04,0x0a,0x76]
 0x7e,0x04,0x0a,0x76
 
-# GFX11: v_ldexp_f16_e32 v5, exec_hi, v2         ; encoding: [0x7f,0x04,0x0a,0x76]
+# GFX11-FAKE16: v_ldexp_f16_e32 v5, exec_hi, v2         ; encoding: [0x7f,0x04,0x0a,0x76]
+# GFX11-REAL16:	v_ldexp_f16_e32 v5.l, exec_hi, v2.l     ; encoding: [0x7f,0x04,0x0a,0x76]
 0x7f,0x04,0x0a,0x76
 
-# GFX11: v_ldexp_f16_e32 v5, null, v2            ; encoding: [0x7c,0x04,0x0a,0x76]
+# GFX11-FAKE16: v_ldexp_f16_e32 v5, null, v2            ; encoding: [0x7c,0x04,0x0a,0x76]
+# GFX11-REAL16:	v_ldexp_f16_e32 v5.l, null, v2.l        ; encoding: [0x7c,0x04,0x0a,0x76]
 0x7c,0x04,0x0a,0x76
 
-# GFX11: v_ldexp_f16_e32 v5, -1, v2              ; encoding: [0xc1,0x04,0x0a,0x76]
+# GFX11-FAKE16: v_ldexp_f16_e32 v5, -1, v2              ; encoding: [0xc1,0x04,0x0a,0x76]
+# GFX11-REAL16:	v_ldexp_f16_e32 v5.l, -1, v2.l          ; encoding: [0xc1,0x04,0x0a,0x76]
 0xc1,0x04,0x0a,0x76
 
-# GFX11: v_ldexp_f16_e32 v5, 0.5, v2             ; encoding: [0xf0,0x04,0x0a,0x76]
+# GFX11-FAKE16: v_ldexp_f16_e32 v5, 0.5, v2             ; encoding: [0xf0,0x04,0x0a,0x76]
+# GFX11-REAL16:	v_ldexp_f16_e32 v5.l, 0.5, v2.l         ; encoding: [0xf0,0x04,0x0a,0x76]
 0xf0,0x04,0x0a,0x76
 
-# GFX11: v_ldexp_f16_e32 v5, src_scc, v2         ; encoding: [0xfd,0x04,0x0a,0x76]
+# GFX11-FAKE16: v_ldexp_f16_e32 v5, src_scc, v2         ; encoding: [0xfd,0x04,0x0a,0x76]
+# GFX11-REAL16:	v_ldexp_f16_e32 v5.l, src_scc, v2.l     ; encoding: [0xfd,0x04,0x0a,0x76]
 0xfd,0x04,0x0a,0x76
 
-# GFX11: v_ldexp_f16_e32 v127, 0xfe0b, v127      ; encoding: [0xff,0xfe,0xfe,0x76,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_ldexp_f16_e32 v127, 0xfe0b, v127      ; encoding: [0xff,0xfe,0xfe,0x76,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16:	v_ldexp_f16_e32 v127.l, 0xfe0b, v127.l  ; encoding: [0xff,0xfe,0xfe,0x76,0x0b,0xfe,0x00,0x00]
 0xff,0xfe,0xfe,0x76,0x0b,0xfe,0x00,0x00
 
 # GFX11: v_lshlrev_b32_e32 v5, v1, v2            ; encoding: [0x01,0x05,0x0a,0x30]
diff --git a/llvm/test/MC/X86/apx/ccmp-att.s b/llvm/test/MC/X86/apx/ccmp-att.s
index e7dd91d7da66cc..6e919bcd7f57fe 100644
--- a/llvm/test/MC/X86/apx/ccmp-att.s
+++ b/llvm/test/MC/X86/apx/ccmp-att.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
 # RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
 
-# ERROR-COUNT-454: error:
+# ERROR-COUNT-466: error:
 # ERROR-NOT: error:
 ## Condition flags
 
@@ -1376,3 +1376,42 @@
 # CHECK: ccmptq	{dfv=}	%r9, %r15
 # CHECK: encoding: [0x62,0x54,0x84,0x0a,0x39,0xcf]
          {evex} cmpq	%r9, %r15
+
+## Condition Code Aliases
+
+# CHECK: ccmpbl {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0x39,0xca]
+         ccmpcl {dfv=of} %ecx, %edx
+# CHECK: ccmpbl {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0x39,0xca]
+         ccmpnael {dfv=of} %ecx, %edx
+# CHECK: ccmpael {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0x39,0xca]
+         ccmpnbl {dfv=of} %ecx, %edx
+# CHECK: ccmpael {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0x39,0xca]
+         ccmpncl {dfv=of} %ecx, %edx
+# CHECK: ccmpel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0x39,0xca]
+         ccmpzl {dfv=of} %ecx, %edx
+# CHECK: ccmpnel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0x39,0xca]
+         ccmpnzl {dfv=of} %ecx, %edx
+# CHECK: ccmpal {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0x39,0xca]
+         ccmpnbel {dfv=of} %ecx, %edx
+# CHECK: ccmpll {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0x39,0xca]
+         ccmpngel {dfv=of} %ecx, %edx
+# CHECK: ccmpgel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0x39,0xca]
+         ccmpnll {dfv=of} %ecx, %edx
+# CHECK: ccmplel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0x39,0xca]
+         ccmpngl {dfv=of} %ecx, %edx
+# CHECK: ccmpgl {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0x39,0xca]
+         ccmpnlel {dfv=of} %ecx, %edx
+# CHECK: ccmpbel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0x39,0xca]
+         ccmpnal {dfv=of} %ecx, %edx
\ No newline at end of file
diff --git a/llvm/test/MC/X86/apx/ccmp-intel.s b/llvm/test/MC/X86/apx/ccmp-intel.s
index ec3f72b8a0a8d1..c537633487294f 100644
--- a/llvm/test/MC/X86/apx/ccmp-intel.s
+++ b/llvm/test/MC/X86/apx/ccmp-intel.s
@@ -1373,3 +1373,42 @@
 # CHECK: ccmpt	{dfv=}	r15, r9
 # CHECK: encoding: [0x62,0x54,0x84,0x0a,0x39,0xcf]
          {evex} cmp	r15, r9
+
+## Condition Code Aliases
+
+# CHECK: ccmpb {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0x39,0xca]
+         ccmpc {dfv=of} edx, ecx
+# CHECK: ccmpb {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0x39,0xca]
+         ccmpnae {dfv=of} edx, ecx
+# CHECK: ccmpae {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0x39,0xca]
+         ccmpnb {dfv=of} edx, ecx
+# CHECK: ccmpae {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0x39,0xca]
+         ccmpnc {dfv=of} edx, ecx
+# CHECK: ccmpe {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0x39,0xca]
+         ccmpz {dfv=of} edx, ecx
+# CHECK: ccmpne {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0x39,0xca]
+         ccmpnz {dfv=of} edx, ecx
+# CHECK: ccmpa {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0x39,0xca]
+         ccmpnbe {dfv=of} edx, ecx
+# CHECK: ccmpl {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0x39,0xca]
+         ccmpnge {dfv=of} edx, ecx
+# CHECK: ccmpge {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0x39,0xca]
+         ccmpnl {dfv=of} edx, ecx
+# CHECK: ccmple {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0x39,0xca]
+         ccmpng {dfv=of} edx, ecx
+# CHECK: ccmpg {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0x39,0xca]
+         ccmpnle {dfv=of} edx, ecx
+# CHECK: ccmpbe {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0x39,0xca]
+         ccmpna {dfv=of} edx, ecx
diff --git a/llvm/test/MC/X86/apx/ctest-att.s b/llvm/test/MC/X86/apx/ctest-att.s
index 4cb928748a1d26..3b6e0b4508432f 100644
--- a/llvm/test/MC/X86/apx/ctest-att.s
+++ b/llvm/test/MC/X86/apx/ctest-att.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
 # RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
 
-# ERROR-COUNT-276: error:
+# ERROR-COUNT-288: error:
 # ERROR-NOT: error:
 # CHECK: ctestbb {dfv=of} $123, 123(%r8,%rax,4)
 # CHECK: encoding: [0x62,0xd4,0x44,0x02,0xf6,0x44,0x80,0x7b,0x7b]
@@ -835,3 +835,42 @@
 # CHECK: ctesttq	{dfv=}	%r9, %r15
 # CHECK: encoding: [0x62,0x54,0x84,0x0a,0x85,0xcf]
          {evex} testq	%r9, %r15
+
+## Condition Code Aliases
+
+# CHECK: ctestbl {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0x85,0xca]
+         ctestcl {dfv=of} %ecx, %edx
+# CHECK: ctestbl {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0x85,0xca]
+         ctestnael {dfv=of} %ecx, %edx
+# CHECK: ctestael {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0x85,0xca]
+         ctestnbl {dfv=of} %ecx, %edx
+# CHECK: ctestael {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0x85,0xca]
+         ctestncl {dfv=of} %ecx, %edx
+# CHECK: ctestel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0x85,0xca]
+         ctestzl {dfv=of} %ecx, %edx
+# CHECK: ctestnel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0x85,0xca]
+         ctestnzl {dfv=of} %ecx, %edx
+# CHECK: ctestal {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0x85,0xca]
+         ctestnbel {dfv=of} %ecx, %edx
+# CHECK: ctestll {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0x85,0xca]
+         ctestngel {dfv=of} %ecx, %edx
+# CHECK: ctestgel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0x85,0xca]
+         ctestnll {dfv=of} %ecx, %edx
+# CHECK: ctestlel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0x85,0xca]
+         ctestngl {dfv=of} %ecx, %edx
+# CHECK: ctestgl {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0x85,0xca]
+         ctestnlel {dfv=of} %ecx, %edx
+# CHECK: ctestbel {dfv=of} %ecx, %edx
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0x85,0xca]
+         ctestnal {dfv=of} %ecx, %edx
diff --git a/llvm/test/MC/X86/apx/ctest-intel.s b/llvm/test/MC/X86/apx/ctest-intel.s
index 701c517e27a798..ad065f8d767360 100644
--- a/llvm/test/MC/X86/apx/ctest-intel.s
+++ b/llvm/test/MC/X86/apx/ctest-intel.s
@@ -831,3 +831,42 @@
 # CHECK: ctestt	{dfv=}	r15, r9
 # CHECK: encoding: [0x62,0x54,0x84,0x0a,0x85,0xcf]
          {evex} test	r15, r9
+
+## Condition Code Aliases
+
+# CHECK: ctestb {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0x85,0xca]
+         ctestc {dfv=of} edx, ecx
+# CHECK: ctestb {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x02,0x85,0xca]
+         ctestnae {dfv=of} edx, ecx
+# CHECK: ctestae {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0x85,0xca]
+         ctestnb {dfv=of} edx, ecx
+# CHECK: ctestae {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x03,0x85,0xca]
+         ctestnc {dfv=of} edx, ecx
+# CHECK: cteste {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x04,0x85,0xca]
+         ctestz {dfv=of} edx, ecx
+# CHECK: ctestne {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x05,0x85,0xca]
+         ctestnz {dfv=of} edx, ecx
+# CHECK: ctesta {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x07,0x85,0xca]
+         ctestnbe {dfv=of} edx, ecx
+# CHECK: ctestl {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0c,0x85,0xca]
+         ctestnge {dfv=of} edx, ecx
+# CHECK: ctestge {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0d,0x85,0xca]
+         ctestnl {dfv=of} edx, ecx
+# CHECK: ctestle {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0e,0x85,0xca]
+         ctestng {dfv=of} edx, ecx
+# CHECK: ctestg {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x0f,0x85,0xca]
+         ctestnle {dfv=of} edx, ecx
+# CHECK: ctestbe {dfv=of} edx, ecx
+# CHECK: encoding: [0x62,0xf4,0x44,0x06,0x85,0xca]
+         ctestna {dfv=of} edx, ecx
diff --git a/llvm/test/MC/X86/apx/setzucc-att.s b/llvm/test/MC/X86/apx/setzucc-att.s
index b4b7a633fa319a..941057b1e72fc6 100644
--- a/llvm/test/MC/X86/apx/setzucc-att.s
+++ b/llvm/test/MC/X86/apx/setzucc-att.s
@@ -1,7 +1,7 @@
 # RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
 # RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
 
-# ERROR-COUNT-32: error:
+# ERROR-COUNT-46: error:
 # ERROR-NOT: error:
 # CHECK: setzuo	%al
 # CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x40,0xc0]
@@ -99,3 +99,47 @@
 # CHECK: setzug	(%rax)
 # CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x4f,0x00]
          setzug	(%rax)
+
+# Alias tests:
+# CHECK: setzub	%al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x42,0xc0]
+         setzunae	%al
+# CHECK: setzub	%al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x42,0xc0]
+         setzuc	%al
+# CHECK: setzuae	%al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x43,0xc0]
+         setzunb	%al
+# CHECK: setzuae	%al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x43,0xc0]
+         setzunc	%al
+# CHECK: setzue	%al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x44,0xc0]
+         setzuz	%al
+# CHECK: setzune	%al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x45,0xc0]
+         setzunz	%al
+# CHECK: setzube	%al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x46,0xc0]
+         setzuna	%al
+# CHECK: setzua	%al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x47,0xc0]
+         setzunbe	%al
+# CHECK: setzup	%al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x4a,0xc0]
+         setzupe	%al
+# CHECK: setzunp	%al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x4b,0xc0]
+         setzupo	%al
+# CHECK: setzul	%al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x4c,0xc0]
+         setzunge	%al
+# CHECK: setzuge	%al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x4d,0xc0]
+         setzunl	%al
+# CHECK: setzule	%al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x4e,0xc0]
+         setzung	%al
+# CHECK: setzug	%al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x4f,0xc0]
+         setzunle	%al
diff --git a/llvm/test/MC/X86/apx/setzucc-intel.s b/llvm/test/MC/X86/apx/setzucc-intel.s
index bdefba6ac8d30a..d5476e5c1534f5 100644
--- a/llvm/test/MC/X86/apx/setzucc-intel.s
+++ b/llvm/test/MC/X86/apx/setzucc-intel.s
@@ -96,3 +96,47 @@
 # CHECK: setzug	byte ptr [rax]
 # CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x4f,0x00]
          setzug	byte ptr [rax]
+
+# Alias tests:
+# CHECK: setzub	al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x42,0xc0]
+         setzunae	al
+# CHECK: setzub	al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x42,0xc0]
+         setzuc	al
+# CHECK: setzuae	al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x43,0xc0]
+         setzunb	al
+# CHECK: setzuae	al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x43,0xc0]
+         setzunc	al
+# CHECK: setzue	al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x44,0xc0]
+         setzuz	al
+# CHECK: setzune	al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x45,0xc0]
+         setzunz	al
+# CHECK: setzube	al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x46,0xc0]
+         setzuna	al
+# CHECK: setzua	al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x47,0xc0]
+         setzunbe	al
+# CHECK: setzup	al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x4a,0xc0]
+         setzupe	al
+# CHECK: setzunp	al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x4b,0xc0]
+         setzupo	al
+# CHECK: setzul	al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x4c,0xc0]
+         setzunge	al
+# CHECK: setzuge	al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x4d,0xc0]
+         setzunl	al
+# CHECK: setzule	al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x4e,0xc0]
+         setzung	al
+# CHECK: setzug	al
+# CHECK: encoding: [0x62,0xf4,0x7f,0x18,0x4f,0xc0]
+         setzunle	al
\ No newline at end of file
diff --git a/llvm/test/MC/X86/cmpccxadd-att-alias.s b/llvm/test/MC/X86/cmpccxadd-att-alias.s
index e4d8fff0a38962..dcc0f105d7abc1 100644
--- a/llvm/test/MC/X86/cmpccxadd-att-alias.s
+++ b/llvm/test/MC/X86/cmpccxadd-att-alias.s
@@ -24,3 +24,35 @@
 // CHECK: encoding: [0xc4,0xe2,0x79,0xef,0x0d,0x00,0x00,0x00,0x00]
           cmpgxadd  %eax, %ecx, (%rip)
 
+// CHECK: cmpbxadd  %eax, %ecx, (%rip)
+// CHECK: encoding: [0xc4,0xe2,0x79,0xe2,0x0d,0x00,0x00,0x00,0x00]
+          cmpcxadd  %eax, %ecx, (%rip)
+
+// CHECK: cmpbxadd  %eax, %ecx, (%rip)
+// CHECK: encoding: [0xc4,0xe2,0x79,0xe2,0x0d,0x00,0x00,0x00,0x00]
+          cmpnaexadd  %eax, %ecx, (%rip)
+
+// CHECK: cmpnbxadd  %eax, %ecx, (%rip)
+// CHECK: encoding: [0xc4,0xe2,0x79,0xe3,0x0d,0x00,0x00,0x00,0x00]
+          cmpncxadd  %eax, %ecx, (%rip)
+
+// CHECK: cmpbexadd  %eax, %ecx, (%rip)
+// CHECK: encoding: [0xc4,0xe2,0x79,0xe6,0x0d,0x00,0x00,0x00,0x00]
+          cmpnaxadd  %eax, %ecx, (%rip)
+
+// CHECK: cmplexadd  %eax, %ecx, (%rip)
+// CHECK: encoding: [0xc4,0xe2,0x79,0xee,0x0d,0x00,0x00,0x00,0x00]
+          cmpngxadd  %eax, %ecx, (%rip)
+
+// CHECK: cmppxadd  %eax, %ecx, (%rip)
+// CHECK: encoding: [0xc4,0xe2,0x79,0xea,0x0d,0x00,0x00,0x00,0x00]
+          cmppexadd  %eax, %ecx, (%rip)
+
+// CHECK: cmpnpxadd  %eax, %ecx, (%rip)
+// CHECK: encoding: [0xc4,0xe2,0x79,0xeb,0x0d,0x00,0x00,0x00,0x00]
+          cmppoxadd  %eax, %ecx, (%rip)
+
+// CHECK: cmplxadd  %eax, %ecx, (%rip)
+// CHECK: encoding: [0xc4,0xe2,0x79,0xec,0x0d,0x00,0x00,0x00,0x00]
+          cmpngexadd  %eax, %ecx, (%rip)
+
diff --git a/llvm/test/MC/X86/cmpccxadd-intel-alias.s b/llvm/test/MC/X86/cmpccxadd-intel-alias.s
index 68a8736a3b688c..f5c7a6b6a2e0a5 100644
--- a/llvm/test/MC/X86/cmpccxadd-intel-alias.s
+++ b/llvm/test/MC/X86/cmpccxadd-intel-alias.s
@@ -23,3 +23,35 @@
 // CHECK: cmpnlexadd dword ptr [rip], ecx, eax
 // CHECK: encoding: [0xc4,0xe2,0x79,0xef,0x0d,0x00,0x00,0x00,0x00]
           cmpgxadd dword ptr [rip], ecx, eax
+
+// CHECK: cmpbxadd  dword ptr [rip], ecx, eax
+// CHECK: encoding: [0xc4,0xe2,0x79,0xe2,0x0d,0x00,0x00,0x00,0x00]
+          cmpcxadd  dword ptr [rip], ecx, eax
+
+// CHECK: cmpbxadd  dword ptr [rip], ecx, eax
+// CHECK: encoding: [0xc4,0xe2,0x79,0xe2,0x0d,0x00,0x00,0x00,0x00]
+          cmpnaexadd  dword ptr [rip], ecx, eax
+
+// CHECK: cmpnbxadd  dword ptr [rip], ecx, eax
+// CHECK: encoding: [0xc4,0xe2,0x79,0xe3,0x0d,0x00,0x00,0x00,0x00]
+          cmpncxadd  dword ptr [rip], ecx, eax
+
+// CHECK: cmpbexadd  dword ptr [rip], ecx, eax
+// CHECK: encoding: [0xc4,0xe2,0x79,0xe6,0x0d,0x00,0x00,0x00,0x00]
+          cmpnaxadd  dword ptr [rip], ecx, eax
+
+// CHECK: cmplexadd  dword ptr [rip], ecx, eax
+// CHECK: encoding: [0xc4,0xe2,0x79,0xee,0x0d,0x00,0x00,0x00,0x00]
+          cmpngxadd  dword ptr [rip], ecx, eax
+
+// CHECK: cmppxadd  dword ptr [rip], ecx, eax
+// CHECK: encoding: [0xc4,0xe2,0x79,0xea,0x0d,0x00,0x00,0x00,0x00]
+          cmppexadd  dword ptr [rip], ecx, eax
+
+// CHECK: cmpnpxadd  dword ptr [rip], ecx, eax
+// CHECK: encoding: [0xc4,0xe2,0x79,0xeb,0x0d,0x00,0x00,0x00,0x00]
+          cmppoxadd  dword ptr [rip], ecx, eax
+
+// CHECK: cmplxadd  dword ptr [rip], ecx, eax
+// CHECK: encoding: [0xc4,0xe2,0x79,0xec,0x0d,0x00,0x00,0x00,0x00]
+          cmpngexadd  dword ptr [rip], ecx, eax
diff --git a/llvm/test/Transforms/AlignmentFromAssumptions/domtree-crash.ll b/llvm/test/Transforms/AlignmentFromAssumptions/domtree-crash.ll
new file mode 100644
index 00000000000000..c7fc1dc6996718
--- /dev/null
+++ b/llvm/test/Transforms/AlignmentFromAssumptions/domtree-crash.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=alignment-from-assumptions -S < %s | FileCheck %s
+
+; The alignment assumption is a global, which has users in a different
+; function. Test that in this case the dominator tree is only queried with
+; blocks from the same function.
+
+@global = external constant [192 x i8]
+
+define void @fn1() {
+; CHECK-LABEL: define void @fn1() {
+; CHECK-NEXT:    call void @llvm.assume(i1 false) [ "align"(ptr @global, i64 1) ]
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.assume(i1 false) [ "align"(ptr @global, i64 1) ]
+  ret void
+}
+
+define void @fn2() {
+; CHECK-LABEL: define void @fn2() {
+; CHECK-NEXT:    ret void
+; CHECK:       [[LOOP:.*]]:
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr @global, i64 0
+; CHECK-NEXT:    [[LOAD:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT:    br label %[[LOOP]]
+;
+  ret void
+
+loop:
+  %gep = getelementptr inbounds i8, ptr @global, i64 0
+  %load = load i64, ptr %gep, align 1
+  br label %loop
+}
diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
index ba6802f85c03cd..fa2ad60db7c291 100644
--- a/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll
@@ -6,17 +6,17 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
 ; CHECK-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    ret float [[TMP6]]
+; CHECK-NEXT:    ret float [[TMP5]]
 ;
   %res = atomicrmw fadd ptr %ptr, float %value seq_cst
   ret float %res
@@ -27,17 +27,17 @@ define float @test_atomicrmw_fsub_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
 ; CHECK-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    ret float [[TMP6]]
+; CHECK-NEXT:    ret float [[TMP5]]
 ;
   %res = atomicrmw fsub ptr %ptr, float %value seq_cst
   ret float %res
diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
index ef2b5fe3672be0..95a52aa0f7f527 100644
--- a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll
@@ -4,23 +4,23 @@
 
 define void @atomic_swap_f16(ptr %ptr, half %val) nounwind {
 ; CHECK-LABEL: @atomic_swap_f16(
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast half [[VAL:%.*]] to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half [[VAL:%.*]] to i16
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i16) [[PTR:%.*]])
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i16
-; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP5]], ptr elementtype(i16) [[PTR]])
-; CHECK-NEXT:    [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i16) [[PTR:%.*]])
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i16
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP4]], ptr elementtype(i16) [[PTR]])
+; CHECK-NEXT:    [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP5]], 0
 ; CHECK-NEXT:    br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16 [[TMP4]] to half
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i16 [[TMP3]] to half
 ; CHECK-NEXT:    ret void
 ;
 ; OUTLINE-ATOMICS-LABEL: @atomic_swap_f16(
-; OUTLINE-ATOMICS-NEXT:    [[TMP2:%.*]] = bitcast half [[VAL:%.*]] to i16
-; OUTLINE-ATOMICS-NEXT:    [[TMP3:%.*]] = atomicrmw xchg ptr [[PTR:%.*]], i16 [[TMP2]] acquire, align 2
-; OUTLINE-ATOMICS-NEXT:    [[TMP4:%.*]] = bitcast i16 [[TMP3]] to half
+; OUTLINE-ATOMICS-NEXT:    [[TMP1:%.*]] = bitcast half [[VAL:%.*]] to i16
+; OUTLINE-ATOMICS-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr [[PTR:%.*]], i16 [[TMP1]] acquire, align 2
+; OUTLINE-ATOMICS-NEXT:    [[TMP3:%.*]] = bitcast i16 [[TMP2]] to half
 ; OUTLINE-ATOMICS-NEXT:    ret void
 ;
   %t1 = atomicrmw xchg ptr %ptr, half %val acquire
@@ -29,23 +29,23 @@ define void @atomic_swap_f16(ptr %ptr, half %val) nounwind {
 
 define void @atomic_swap_f32(ptr %ptr, float %val) nounwind {
 ; CHECK-LABEL: @atomic_swap_f32(
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[VAL:%.*]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float [[VAL:%.*]] to i32
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i32) [[PTR:%.*]])
-; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP2]] to i64
-; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP5]], ptr elementtype(i32) [[PTR]])
-; CHECK-NEXT:    [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP6]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i32) [[PTR:%.*]])
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP4]], ptr elementtype(i32) [[PTR]])
+; CHECK-NEXT:    [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP5]], 0
 ; CHECK-NEXT:    br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32 [[TMP4]] to float
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32 [[TMP3]] to float
 ; CHECK-NEXT:    ret void
 ;
 ; OUTLINE-ATOMICS-LABEL: @atomic_swap_f32(
-; OUTLINE-ATOMICS-NEXT:    [[TMP2:%.*]] = bitcast float [[VAL:%.*]] to i32
-; OUTLINE-ATOMICS-NEXT:    [[TMP3:%.*]] = atomicrmw xchg ptr [[PTR:%.*]], i32 [[TMP2]] acquire, align 4
-; OUTLINE-ATOMICS-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float
+; OUTLINE-ATOMICS-NEXT:    [[TMP1:%.*]] = bitcast float [[VAL:%.*]] to i32
+; OUTLINE-ATOMICS-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr [[PTR:%.*]], i32 [[TMP1]] acquire, align 4
+; OUTLINE-ATOMICS-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
 ; OUTLINE-ATOMICS-NEXT:    ret void
 ;
   %t1 = atomicrmw xchg ptr %ptr, float %val acquire
@@ -54,21 +54,21 @@ define void @atomic_swap_f32(ptr %ptr, float %val) nounwind {
 
 define void @atomic_swap_f64(ptr %ptr, double %val) nounwind {
 ; CHECK-LABEL: @atomic_swap_f64(
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double [[VAL:%.*]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double [[VAL:%.*]] to i64
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i64) [[PTR:%.*]])
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP2]], ptr elementtype(i64) [[PTR]])
-; CHECK-NEXT:    [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP4]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i64) [[PTR:%.*]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP1]], ptr elementtype(i64) [[PTR]])
+; CHECK-NEXT:    [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP3]], 0
 ; CHECK-NEXT:    br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64 [[TMP3]] to double
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64 [[TMP2]] to double
 ; CHECK-NEXT:    ret void
 ;
 ; OUTLINE-ATOMICS-LABEL: @atomic_swap_f64(
-; OUTLINE-ATOMICS-NEXT:    [[TMP2:%.*]] = bitcast double [[VAL:%.*]] to i64
-; OUTLINE-ATOMICS-NEXT:    [[TMP3:%.*]] = atomicrmw xchg ptr [[PTR:%.*]], i64 [[TMP2]] acquire, align 8
-; OUTLINE-ATOMICS-NEXT:    [[TMP4:%.*]] = bitcast i64 [[TMP3]] to double
+; OUTLINE-ATOMICS-NEXT:    [[TMP1:%.*]] = bitcast double [[VAL:%.*]] to i64
+; OUTLINE-ATOMICS-NEXT:    [[TMP2:%.*]] = atomicrmw xchg ptr [[PTR:%.*]], i64 [[TMP1]] acquire, align 8
+; OUTLINE-ATOMICS-NEXT:    [[TMP3:%.*]] = bitcast i64 [[TMP2]] to double
 ; OUTLINE-ATOMICS-NEXT:    ret void
 ;
   %t1 = atomicrmw xchg ptr %ptr, double %val acquire
diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/pcsections.ll b/llvm/test/Transforms/AtomicExpand/AArch64/pcsections.ll
index cc42407c0210e7..c5c890559152df 100644
--- a/llvm/test/Transforms/AtomicExpand/AArch64/pcsections.ll
+++ b/llvm/test/Transforms/AtomicExpand/AArch64/pcsections.ll
@@ -4,7 +4,7 @@
 define i8 @atomic8_load_unordered(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_load_unordered(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i8, ptr [[A:%.*]] unordered, align 1, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i8, ptr [[A:%.*]] unordered, align 1, !pcsections [[META0:![0-9]+]]
 ; CHECK-NEXT:    ret i8 [[TMP0]]
 ;
 entry:
@@ -15,7 +15,7 @@ entry:
 define i8 @atomic8_load_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_load_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i8, ptr [[A:%.*]] monotonic, align 1, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i8, ptr [[A:%.*]] monotonic, align 1, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i8 [[TMP0]]
 ;
 entry:
@@ -26,7 +26,7 @@ entry:
 define i8 @atomic8_load_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_load_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i8, ptr [[A:%.*]] acquire, align 1, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i8, ptr [[A:%.*]] acquire, align 1, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i8 [[TMP0]]
 ;
 entry:
@@ -37,7 +37,7 @@ entry:
 define i8 @atomic8_load_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_load_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i8, ptr [[A:%.*]] seq_cst, align 1, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i8, ptr [[A:%.*]] seq_cst, align 1, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i8 [[TMP0]]
 ;
 entry:
@@ -48,7 +48,7 @@ entry:
 define void @atomic8_store_unordered(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_store_unordered(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store atomic i8 0, ptr [[A:%.*]] unordered, align 1, !pcsections !0
+; CHECK-NEXT:    store atomic i8 0, ptr [[A:%.*]] unordered, align 1, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -59,7 +59,7 @@ entry:
 define void @atomic8_store_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_store_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store atomic i8 0, ptr [[A:%.*]] monotonic, align 1, !pcsections !0
+; CHECK-NEXT:    store atomic i8 0, ptr [[A:%.*]] monotonic, align 1, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -70,7 +70,7 @@ entry:
 define void @atomic8_store_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_store_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store atomic i8 0, ptr [[A:%.*]] release, align 1, !pcsections !0
+; CHECK-NEXT:    store atomic i8 0, ptr [[A:%.*]] release, align 1, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -81,7 +81,7 @@ entry:
 define void @atomic8_store_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_store_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store atomic i8 0, ptr [[A:%.*]] seq_cst, align 1, !pcsections !0
+; CHECK-NEXT:    store atomic i8 0, ptr [[A:%.*]] seq_cst, align 1, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -92,14 +92,14 @@ entry:
 define void @atomic8_xchg_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_xchg_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 monotonic monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 monotonic monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -111,14 +111,14 @@ entry:
 define void @atomic8_add_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_add_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] monotonic monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] monotonic monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -130,14 +130,14 @@ entry:
 define void @atomic8_sub_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_sub_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] monotonic monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] monotonic monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -149,14 +149,14 @@ entry:
 define void @atomic8_and_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_and_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 monotonic monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 monotonic monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -168,14 +168,14 @@ entry:
 define void @atomic8_or_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_or_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] monotonic monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] monotonic monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -187,14 +187,14 @@ entry:
 define void @atomic8_xor_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_xor_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] monotonic monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] monotonic monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -206,14 +206,14 @@ entry:
 define void @atomic8_nand_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_nand_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 monotonic monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 monotonic monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -225,14 +225,14 @@ entry:
 define void @atomic8_xchg_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_xchg_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 acquire acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 acquire acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -244,14 +244,14 @@ entry:
 define void @atomic8_add_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_add_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acquire acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acquire acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -263,14 +263,14 @@ entry:
 define void @atomic8_sub_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_sub_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acquire acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acquire acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -282,14 +282,14 @@ entry:
 define void @atomic8_and_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_and_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 acquire acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 acquire acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -301,14 +301,14 @@ entry:
 define void @atomic8_or_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_or_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acquire acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acquire acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -320,14 +320,14 @@ entry:
 define void @atomic8_xor_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_xor_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acquire acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acquire acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -339,14 +339,14 @@ entry:
 define void @atomic8_nand_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_nand_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 acquire acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 acquire acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -358,14 +358,14 @@ entry:
 define void @atomic8_xchg_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_xchg_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 release monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 release monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -377,14 +377,14 @@ entry:
 define void @atomic8_add_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_add_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] release monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] release monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -396,14 +396,14 @@ entry:
 define void @atomic8_sub_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_sub_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] release monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] release monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -415,14 +415,14 @@ entry:
 define void @atomic8_and_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_and_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 release monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 release monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -434,14 +434,14 @@ entry:
 define void @atomic8_or_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_or_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] release monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] release monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -453,14 +453,14 @@ entry:
 define void @atomic8_xor_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_xor_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] release monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] release monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -472,14 +472,14 @@ entry:
 define void @atomic8_nand_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_nand_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 release monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 release monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -491,14 +491,14 @@ entry:
 define void @atomic8_xchg_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_xchg_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 acq_rel acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 acq_rel acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -510,14 +510,14 @@ entry:
 define void @atomic8_add_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_add_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acq_rel acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acq_rel acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -529,14 +529,14 @@ entry:
 define void @atomic8_sub_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_sub_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acq_rel acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acq_rel acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -548,14 +548,14 @@ entry:
 define void @atomic8_and_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_and_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 acq_rel acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 acq_rel acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -567,14 +567,14 @@ entry:
 define void @atomic8_or_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_or_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acq_rel acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acq_rel acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -586,14 +586,14 @@ entry:
 define void @atomic8_xor_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_xor_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acq_rel acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] acq_rel acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -605,14 +605,14 @@ entry:
 define void @atomic8_nand_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_nand_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 acq_rel acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 acq_rel acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -624,14 +624,14 @@ entry:
 define void @atomic8_xchg_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_xchg_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 seq_cst seq_cst, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 seq_cst seq_cst, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -643,14 +643,14 @@ entry:
 define void @atomic8_add_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_add_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] seq_cst seq_cst, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] seq_cst seq_cst, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -662,14 +662,14 @@ entry:
 define void @atomic8_sub_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_sub_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] seq_cst seq_cst, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] seq_cst seq_cst, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -681,14 +681,14 @@ entry:
 define void @atomic8_and_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_and_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 seq_cst seq_cst, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 0 seq_cst seq_cst, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -700,14 +700,14 @@ entry:
 define void @atomic8_or_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_or_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] seq_cst seq_cst, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] seq_cst seq_cst, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -719,14 +719,14 @@ entry:
 define void @atomic8_xor_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_xor_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] seq_cst seq_cst, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 [[LOADED]] seq_cst seq_cst, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -738,14 +738,14 @@ entry:
 define void @atomic8_nand_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_nand_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 seq_cst seq_cst, align 1, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i8 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 [[LOADED]], i8 -1 seq_cst seq_cst, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i8, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i8, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -757,9 +757,9 @@ entry:
 define void @atomic8_cas_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_cas_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 monotonic monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 monotonic acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 monotonic seq_cst, align 1, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 monotonic monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 monotonic acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 monotonic seq_cst, align 1, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -772,9 +772,9 @@ entry:
 define void @atomic8_cas_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_cas_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 acquire monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 acquire acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 acquire seq_cst, align 1, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 acquire monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 acquire acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 acquire seq_cst, align 1, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -787,9 +787,9 @@ entry:
 define void @atomic8_cas_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_cas_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 release monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 release acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 release seq_cst, align 1, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 release monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 release acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 release seq_cst, align 1, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -802,9 +802,9 @@ entry:
 define void @atomic8_cas_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_cas_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 acq_rel monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 acq_rel acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 acq_rel seq_cst, align 1, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 acq_rel monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 acq_rel acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 acq_rel seq_cst, align 1, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -817,9 +817,9 @@ entry:
 define void @atomic8_cas_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic8_cas_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 seq_cst monotonic, align 1, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 seq_cst acquire, align 1, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 seq_cst seq_cst, align 1, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i8 0, i8 1 seq_cst monotonic, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 seq_cst acquire, align 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i8 0, i8 1 seq_cst seq_cst, align 1, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -832,7 +832,7 @@ entry:
 define i16 @atomic16_load_unordered(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_load_unordered(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i16, ptr [[A:%.*]] unordered, align 2, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i16, ptr [[A:%.*]] unordered, align 2, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i16 [[TMP0]]
 ;
 entry:
@@ -843,7 +843,7 @@ entry:
 define i16 @atomic16_load_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_load_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i16, ptr [[A:%.*]] monotonic, align 2, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i16, ptr [[A:%.*]] monotonic, align 2, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i16 [[TMP0]]
 ;
 entry:
@@ -854,7 +854,7 @@ entry:
 define i16 @atomic16_load_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_load_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i16, ptr [[A:%.*]] acquire, align 2, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i16, ptr [[A:%.*]] acquire, align 2, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i16 [[TMP0]]
 ;
 entry:
@@ -865,7 +865,7 @@ entry:
 define i16 @atomic16_load_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_load_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i16, ptr [[A:%.*]] seq_cst, align 2, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i16, ptr [[A:%.*]] seq_cst, align 2, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i16 [[TMP0]]
 ;
 entry:
@@ -876,7 +876,7 @@ entry:
 define void @atomic16_store_unordered(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_store_unordered(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store atomic i16 0, ptr [[A:%.*]] unordered, align 2, !pcsections !0
+; CHECK-NEXT:    store atomic i16 0, ptr [[A:%.*]] unordered, align 2, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -887,7 +887,7 @@ entry:
 define void @atomic16_store_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_store_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store atomic i16 0, ptr [[A:%.*]] monotonic, align 2, !pcsections !0
+; CHECK-NEXT:    store atomic i16 0, ptr [[A:%.*]] monotonic, align 2, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -898,7 +898,7 @@ entry:
 define void @atomic16_store_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_store_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store atomic i16 0, ptr [[A:%.*]] release, align 2, !pcsections !0
+; CHECK-NEXT:    store atomic i16 0, ptr [[A:%.*]] release, align 2, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -909,7 +909,7 @@ entry:
 define void @atomic16_store_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_store_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store atomic i16 0, ptr [[A:%.*]] seq_cst, align 2, !pcsections !0
+; CHECK-NEXT:    store atomic i16 0, ptr [[A:%.*]] seq_cst, align 2, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -920,14 +920,14 @@ entry:
 define void @atomic16_xchg_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_xchg_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 monotonic monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 monotonic monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -939,14 +939,14 @@ entry:
 define void @atomic16_add_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_add_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] monotonic monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] monotonic monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -958,14 +958,14 @@ entry:
 define void @atomic16_sub_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_sub_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] monotonic monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] monotonic monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -977,14 +977,14 @@ entry:
 define void @atomic16_and_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_and_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 monotonic monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 monotonic monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -996,14 +996,14 @@ entry:
 define void @atomic16_or_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_or_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] monotonic monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] monotonic monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1015,14 +1015,14 @@ entry:
 define void @atomic16_xor_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_xor_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] monotonic monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] monotonic monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1034,14 +1034,14 @@ entry:
 define void @atomic16_nand_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_nand_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 monotonic monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 monotonic monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1053,14 +1053,14 @@ entry:
 define void @atomic16_xchg_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_xchg_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 acquire acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 acquire acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1072,14 +1072,14 @@ entry:
 define void @atomic16_add_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_add_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acquire acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acquire acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1091,14 +1091,14 @@ entry:
 define void @atomic16_sub_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_sub_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acquire acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acquire acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1110,14 +1110,14 @@ entry:
 define void @atomic16_and_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_and_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 acquire acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 acquire acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1129,14 +1129,14 @@ entry:
 define void @atomic16_or_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_or_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acquire acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acquire acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1148,14 +1148,14 @@ entry:
 define void @atomic16_xor_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_xor_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acquire acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acquire acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1167,14 +1167,14 @@ entry:
 define void @atomic16_nand_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_nand_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 acquire acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 acquire acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1186,14 +1186,14 @@ entry:
 define void @atomic16_xchg_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_xchg_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 release monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 release monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1205,14 +1205,14 @@ entry:
 define void @atomic16_add_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_add_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] release monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] release monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1224,14 +1224,14 @@ entry:
 define void @atomic16_sub_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_sub_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] release monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] release monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1243,14 +1243,14 @@ entry:
 define void @atomic16_and_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_and_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 release monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 release monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1262,14 +1262,14 @@ entry:
 define void @atomic16_or_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_or_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] release monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] release monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1281,14 +1281,14 @@ entry:
 define void @atomic16_xor_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_xor_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] release monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] release monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1300,14 +1300,14 @@ entry:
 define void @atomic16_nand_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_nand_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 release monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 release monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1319,14 +1319,14 @@ entry:
 define void @atomic16_xchg_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_xchg_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 acq_rel acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 acq_rel acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1338,14 +1338,14 @@ entry:
 define void @atomic16_add_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_add_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acq_rel acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acq_rel acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1357,14 +1357,14 @@ entry:
 define void @atomic16_sub_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_sub_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acq_rel acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acq_rel acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1376,14 +1376,14 @@ entry:
 define void @atomic16_and_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_and_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 acq_rel acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 acq_rel acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1395,14 +1395,14 @@ entry:
 define void @atomic16_or_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_or_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acq_rel acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acq_rel acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1414,14 +1414,14 @@ entry:
 define void @atomic16_xor_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_xor_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acq_rel acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] acq_rel acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1433,14 +1433,14 @@ entry:
 define void @atomic16_nand_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_nand_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 acq_rel acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 acq_rel acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1452,14 +1452,14 @@ entry:
 define void @atomic16_xchg_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_xchg_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 seq_cst seq_cst, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 seq_cst seq_cst, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1471,14 +1471,14 @@ entry:
 define void @atomic16_add_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_add_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] seq_cst seq_cst, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] seq_cst seq_cst, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1490,14 +1490,14 @@ entry:
 define void @atomic16_sub_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_sub_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] seq_cst seq_cst, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] seq_cst seq_cst, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1509,14 +1509,14 @@ entry:
 define void @atomic16_and_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_and_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 seq_cst seq_cst, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 0 seq_cst seq_cst, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1528,14 +1528,14 @@ entry:
 define void @atomic16_or_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_or_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] seq_cst seq_cst, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] seq_cst seq_cst, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1547,14 +1547,14 @@ entry:
 define void @atomic16_xor_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_xor_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] seq_cst seq_cst, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 [[LOADED]] seq_cst seq_cst, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1566,14 +1566,14 @@ entry:
 define void @atomic16_nand_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_nand_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[A:%.*]], align 2, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 seq_cst seq_cst, align 2, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i16 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 [[LOADED]], i16 -1 seq_cst seq_cst, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i16, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i16, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1585,9 +1585,9 @@ entry:
 define void @atomic16_cas_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_cas_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 monotonic monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 monotonic acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 monotonic seq_cst, align 2, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 monotonic monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 monotonic acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 monotonic seq_cst, align 2, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1600,9 +1600,9 @@ entry:
 define void @atomic16_cas_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_cas_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 acquire monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 acquire acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 acquire seq_cst, align 2, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 acquire monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 acquire acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 acquire seq_cst, align 2, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1615,9 +1615,9 @@ entry:
 define void @atomic16_cas_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_cas_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 release monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 release acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 release seq_cst, align 2, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 release monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 release acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 release seq_cst, align 2, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1630,9 +1630,9 @@ entry:
 define void @atomic16_cas_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_cas_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 acq_rel monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 acq_rel acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 acq_rel seq_cst, align 2, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 acq_rel monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 acq_rel acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 acq_rel seq_cst, align 2, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1645,9 +1645,9 @@ entry:
 define void @atomic16_cas_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic16_cas_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 seq_cst monotonic, align 2, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 seq_cst acquire, align 2, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 seq_cst seq_cst, align 2, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i16 0, i16 1 seq_cst monotonic, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 seq_cst acquire, align 2, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i16 0, i16 1 seq_cst seq_cst, align 2, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1660,7 +1660,7 @@ entry:
 define i32 @atomic32_load_unordered(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_load_unordered(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i32, ptr [[A:%.*]] unordered, align 4, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i32, ptr [[A:%.*]] unordered, align 4, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
@@ -1671,7 +1671,7 @@ entry:
 define i32 @atomic32_load_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_load_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i32, ptr [[A:%.*]] monotonic, align 4, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i32, ptr [[A:%.*]] monotonic, align 4, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
@@ -1682,7 +1682,7 @@ entry:
 define i32 @atomic32_load_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_load_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i32, ptr [[A:%.*]] acquire, align 4, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i32, ptr [[A:%.*]] acquire, align 4, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
@@ -1693,7 +1693,7 @@ entry:
 define i32 @atomic32_load_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_load_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i32, ptr [[A:%.*]] seq_cst, align 4, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i32, ptr [[A:%.*]] seq_cst, align 4, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
@@ -1704,7 +1704,7 @@ entry:
 define void @atomic32_store_unordered(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_store_unordered(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store atomic i32 0, ptr [[A:%.*]] unordered, align 4, !pcsections !0
+; CHECK-NEXT:    store atomic i32 0, ptr [[A:%.*]] unordered, align 4, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1715,7 +1715,7 @@ entry:
 define void @atomic32_store_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_store_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store atomic i32 0, ptr [[A:%.*]] monotonic, align 4, !pcsections !0
+; CHECK-NEXT:    store atomic i32 0, ptr [[A:%.*]] monotonic, align 4, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1726,7 +1726,7 @@ entry:
 define void @atomic32_store_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_store_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store atomic i32 0, ptr [[A:%.*]] release, align 4, !pcsections !0
+; CHECK-NEXT:    store atomic i32 0, ptr [[A:%.*]] release, align 4, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1737,7 +1737,7 @@ entry:
 define void @atomic32_store_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_store_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store atomic i32 0, ptr [[A:%.*]] seq_cst, align 4, !pcsections !0
+; CHECK-NEXT:    store atomic i32 0, ptr [[A:%.*]] seq_cst, align 4, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1748,14 +1748,14 @@ entry:
 define void @atomic32_xchg_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_xchg_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 monotonic monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 monotonic monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1767,14 +1767,14 @@ entry:
 define void @atomic32_add_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_add_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] monotonic monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] monotonic monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1786,14 +1786,14 @@ entry:
 define void @atomic32_sub_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_sub_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] monotonic monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] monotonic monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1805,14 +1805,14 @@ entry:
 define void @atomic32_and_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_and_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 monotonic monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 monotonic monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1824,14 +1824,14 @@ entry:
 define void @atomic32_or_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_or_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] monotonic monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] monotonic monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1843,14 +1843,14 @@ entry:
 define void @atomic32_xor_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_xor_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] monotonic monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] monotonic monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1862,14 +1862,14 @@ entry:
 define void @atomic32_nand_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_nand_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 monotonic monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 monotonic monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1881,14 +1881,14 @@ entry:
 define void @atomic32_xchg_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_xchg_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 acquire acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 acquire acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1900,14 +1900,14 @@ entry:
 define void @atomic32_add_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_add_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acquire acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acquire acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1919,14 +1919,14 @@ entry:
 define void @atomic32_sub_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_sub_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acquire acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acquire acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1938,14 +1938,14 @@ entry:
 define void @atomic32_and_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_and_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 acquire acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 acquire acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1957,14 +1957,14 @@ entry:
 define void @atomic32_or_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_or_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acquire acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acquire acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1976,14 +1976,14 @@ entry:
 define void @atomic32_xor_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_xor_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acquire acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acquire acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -1995,14 +1995,14 @@ entry:
 define void @atomic32_nand_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_nand_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 acquire acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 acquire acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2014,14 +2014,14 @@ entry:
 define void @atomic32_xchg_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_xchg_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 release monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 release monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2033,14 +2033,14 @@ entry:
 define void @atomic32_add_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_add_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] release monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] release monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2052,14 +2052,14 @@ entry:
 define void @atomic32_sub_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_sub_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] release monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] release monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2071,14 +2071,14 @@ entry:
 define void @atomic32_and_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_and_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 release monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 release monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2090,14 +2090,14 @@ entry:
 define void @atomic32_or_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_or_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] release monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] release monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2109,14 +2109,14 @@ entry:
 define void @atomic32_xor_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_xor_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] release monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] release monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2128,14 +2128,14 @@ entry:
 define void @atomic32_nand_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_nand_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 release monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 release monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2147,14 +2147,14 @@ entry:
 define void @atomic32_xchg_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_xchg_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 acq_rel acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 acq_rel acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2166,14 +2166,14 @@ entry:
 define void @atomic32_add_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_add_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acq_rel acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acq_rel acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2185,14 +2185,14 @@ entry:
 define void @atomic32_sub_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_sub_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acq_rel acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acq_rel acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2204,14 +2204,14 @@ entry:
 define void @atomic32_and_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_and_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 acq_rel acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 acq_rel acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2223,14 +2223,14 @@ entry:
 define void @atomic32_or_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_or_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acq_rel acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acq_rel acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2242,14 +2242,14 @@ entry:
 define void @atomic32_xor_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_xor_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acq_rel acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] acq_rel acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2261,14 +2261,14 @@ entry:
 define void @atomic32_nand_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_nand_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 acq_rel acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 acq_rel acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2280,14 +2280,14 @@ entry:
 define void @atomic32_xchg_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_xchg_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 seq_cst seq_cst, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 seq_cst seq_cst, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2299,14 +2299,14 @@ entry:
 define void @atomic32_add_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_add_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] seq_cst seq_cst, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] seq_cst seq_cst, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2318,14 +2318,14 @@ entry:
 define void @atomic32_sub_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_sub_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] seq_cst seq_cst, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] seq_cst seq_cst, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2337,14 +2337,14 @@ entry:
 define void @atomic32_and_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_and_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 seq_cst seq_cst, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 0 seq_cst seq_cst, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2356,14 +2356,14 @@ entry:
 define void @atomic32_or_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_or_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] seq_cst seq_cst, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] seq_cst seq_cst, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2375,14 +2375,14 @@ entry:
 define void @atomic32_xor_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_xor_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] seq_cst seq_cst, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 [[LOADED]] seq_cst seq_cst, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2394,14 +2394,14 @@ entry:
 define void @atomic32_nand_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_nand_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A:%.*]], align 4, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 seq_cst seq_cst, align 4, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i32 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 [[LOADED]], i32 -1 seq_cst seq_cst, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2413,9 +2413,9 @@ entry:
 define void @atomic32_cas_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_cas_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 monotonic monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 monotonic acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 monotonic seq_cst, align 4, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 monotonic monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 monotonic acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 monotonic seq_cst, align 4, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -2428,9 +2428,9 @@ entry:
 define void @atomic32_cas_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_cas_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 acquire monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 acquire acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 acquire seq_cst, align 4, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 acquire monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 acquire acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 acquire seq_cst, align 4, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -2443,9 +2443,9 @@ entry:
 define void @atomic32_cas_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_cas_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 release monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 release acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 release seq_cst, align 4, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 release monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 release acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 release seq_cst, align 4, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -2458,9 +2458,9 @@ entry:
 define void @atomic32_cas_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_cas_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 acq_rel monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 acq_rel acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 acq_rel seq_cst, align 4, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 acq_rel monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 acq_rel acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 acq_rel seq_cst, align 4, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -2473,9 +2473,9 @@ entry:
 define void @atomic32_cas_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic32_cas_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 seq_cst monotonic, align 4, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 seq_cst acquire, align 4, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 seq_cst seq_cst, align 4, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i32 0, i32 1 seq_cst monotonic, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 seq_cst acquire, align 4, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i32 0, i32 1 seq_cst seq_cst, align 4, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -2488,7 +2488,7 @@ entry:
 define i64 @atomic64_load_unordered(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_load_unordered(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] unordered, align 8, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] unordered, align 8, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i64 [[TMP0]]
 ;
 entry:
@@ -2499,7 +2499,7 @@ entry:
 define i64 @atomic64_load_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_load_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] monotonic, align 8, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] monotonic, align 8, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i64 [[TMP0]]
 ;
 entry:
@@ -2510,7 +2510,7 @@ entry:
 define i64 @atomic64_load_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_load_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] acquire, align 8, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i64 [[TMP0]]
 ;
 entry:
@@ -2521,7 +2521,7 @@ entry:
 define i64 @atomic64_load_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_load_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] seq_cst, align 8, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic i64, ptr [[A:%.*]] seq_cst, align 8, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i64 [[TMP0]]
 ;
 entry:
@@ -2532,7 +2532,7 @@ entry:
 define ptr @atomic64_load_seq_cst_ptr_ty(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_load_seq_cst_ptr_ty(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load atomic ptr, ptr [[A:%.*]] seq_cst, align 8, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load atomic ptr, ptr [[A:%.*]] seq_cst, align 8, !pcsections [[META0]]
 ; CHECK-NEXT:    ret ptr [[TMP0]]
 ;
 entry:
@@ -2543,7 +2543,7 @@ entry:
 define void @atomic64_store_unordered(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_store_unordered(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store atomic i64 0, ptr [[A:%.*]] unordered, align 8, !pcsections !0
+; CHECK-NEXT:    store atomic i64 0, ptr [[A:%.*]] unordered, align 8, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -2554,7 +2554,7 @@ entry:
 define void @atomic64_store_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_store_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store atomic i64 0, ptr [[A:%.*]] monotonic, align 8, !pcsections !0
+; CHECK-NEXT:    store atomic i64 0, ptr [[A:%.*]] monotonic, align 8, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -2565,7 +2565,7 @@ entry:
 define void @atomic64_store_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_store_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store atomic i64 0, ptr [[A:%.*]] release, align 8, !pcsections !0
+; CHECK-NEXT:    store atomic i64 0, ptr [[A:%.*]] release, align 8, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -2576,7 +2576,7 @@ entry:
 define void @atomic64_store_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_store_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store atomic i64 0, ptr [[A:%.*]] seq_cst, align 8, !pcsections !0
+; CHECK-NEXT:    store atomic i64 0, ptr [[A:%.*]] seq_cst, align 8, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -2587,7 +2587,7 @@ entry:
 define void @atomic64_store_seq_cst_ptr_ty(ptr %a, ptr %v) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_store_seq_cst_ptr_ty(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store atomic ptr [[V:%.*]], ptr [[A:%.*]] seq_cst, align 8, !pcsections !0
+; CHECK-NEXT:    store atomic ptr [[V:%.*]], ptr [[A:%.*]] seq_cst, align 8, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -2598,14 +2598,14 @@ entry:
 define void @atomic64_xchg_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_xchg_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 monotonic monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 monotonic monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2617,14 +2617,14 @@ entry:
 define void @atomic64_add_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_add_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] monotonic monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] monotonic monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2636,14 +2636,14 @@ entry:
 define void @atomic64_sub_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_sub_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] monotonic monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] monotonic monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2655,14 +2655,14 @@ entry:
 define void @atomic64_and_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_and_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 monotonic monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 monotonic monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2674,14 +2674,14 @@ entry:
 define void @atomic64_or_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_or_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] monotonic monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] monotonic monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2693,14 +2693,14 @@ entry:
 define void @atomic64_xor_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_xor_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] monotonic monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] monotonic monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2712,14 +2712,14 @@ entry:
 define void @atomic64_nand_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_nand_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 monotonic monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 monotonic monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2731,14 +2731,14 @@ entry:
 define void @atomic64_xchg_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_xchg_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 acquire acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 acquire acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2750,14 +2750,14 @@ entry:
 define void @atomic64_add_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_add_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acquire acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acquire acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2769,14 +2769,14 @@ entry:
 define void @atomic64_sub_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_sub_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acquire acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acquire acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2788,14 +2788,14 @@ entry:
 define void @atomic64_and_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_and_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 acquire acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 acquire acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2807,14 +2807,14 @@ entry:
 define void @atomic64_or_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_or_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acquire acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acquire acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2826,14 +2826,14 @@ entry:
 define void @atomic64_xor_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_xor_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acquire acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acquire acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2845,14 +2845,14 @@ entry:
 define void @atomic64_nand_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_nand_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 acquire acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 acquire acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2864,14 +2864,14 @@ entry:
 define void @atomic64_xchg_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_xchg_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 release monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 release monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2883,14 +2883,14 @@ entry:
 define void @atomic64_add_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_add_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] release monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] release monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2902,14 +2902,14 @@ entry:
 define void @atomic64_sub_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_sub_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] release monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] release monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2921,14 +2921,14 @@ entry:
 define void @atomic64_and_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_and_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 release monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 release monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2940,14 +2940,14 @@ entry:
 define void @atomic64_or_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_or_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] release monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] release monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2959,14 +2959,14 @@ entry:
 define void @atomic64_xor_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_xor_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] release monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] release monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2978,14 +2978,14 @@ entry:
 define void @atomic64_nand_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_nand_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 release monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 release monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -2997,14 +2997,14 @@ entry:
 define void @atomic64_xchg_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_xchg_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 acq_rel acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 acq_rel acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3016,14 +3016,14 @@ entry:
 define void @atomic64_add_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_add_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acq_rel acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acq_rel acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3035,14 +3035,14 @@ entry:
 define void @atomic64_sub_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_sub_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acq_rel acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acq_rel acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3054,14 +3054,14 @@ entry:
 define void @atomic64_and_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_and_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 acq_rel acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 acq_rel acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3073,14 +3073,14 @@ entry:
 define void @atomic64_or_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_or_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acq_rel acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acq_rel acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3092,14 +3092,14 @@ entry:
 define void @atomic64_xor_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_xor_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acq_rel acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] acq_rel acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3111,14 +3111,14 @@ entry:
 define void @atomic64_nand_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_nand_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 acq_rel acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 acq_rel acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3130,14 +3130,14 @@ entry:
 define void @atomic64_xchg_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_xchg_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 seq_cst seq_cst, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 seq_cst seq_cst, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3149,14 +3149,14 @@ entry:
 define void @atomic64_add_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_add_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] seq_cst seq_cst, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] seq_cst seq_cst, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3168,14 +3168,14 @@ entry:
 define void @atomic64_sub_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_sub_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] seq_cst seq_cst, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] seq_cst seq_cst, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3187,14 +3187,14 @@ entry:
 define void @atomic64_and_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_and_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 seq_cst seq_cst, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 0 seq_cst seq_cst, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3206,14 +3206,14 @@ entry:
 define void @atomic64_or_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_or_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] seq_cst seq_cst, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] seq_cst seq_cst, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3225,14 +3225,14 @@ entry:
 define void @atomic64_xor_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_xor_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] seq_cst seq_cst, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 [[LOADED]] seq_cst seq_cst, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3244,14 +3244,14 @@ entry:
 define void @atomic64_nand_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_nand_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A:%.*]], align 8, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 seq_cst seq_cst, align 8, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 [[LOADED]], i64 -1 seq_cst seq_cst, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3263,9 +3263,9 @@ entry:
 define void @atomic64_cas_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_cas_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 monotonic monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 monotonic acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 monotonic seq_cst, align 8, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 monotonic monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 monotonic acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 monotonic seq_cst, align 8, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -3278,9 +3278,9 @@ entry:
 define void @atomic64_cas_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_cas_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 acquire monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 acquire acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 acquire seq_cst, align 8, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 acquire monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 acquire acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 acquire seq_cst, align 8, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -3293,9 +3293,9 @@ entry:
 define void @atomic64_cas_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_cas_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 release monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 release acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 release seq_cst, align 8, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 release monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 release acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 release seq_cst, align 8, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -3308,9 +3308,9 @@ entry:
 define void @atomic64_cas_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_cas_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 acq_rel monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 acq_rel acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 acq_rel seq_cst, align 8, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 acq_rel monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 acq_rel acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 acq_rel seq_cst, align 8, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -3323,9 +3323,9 @@ entry:
 define void @atomic64_cas_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_cas_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 seq_cst monotonic, align 8, !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 seq_cst acquire, align 8, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 seq_cst seq_cst, align 8, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i64 0, i64 1 seq_cst monotonic, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 seq_cst acquire, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A]], i64 0, i64 1 seq_cst seq_cst, align 8, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -3338,14 +3338,14 @@ entry:
 define void @atomic64_cas_seq_cst_ptr_ty(ptr %a, ptr %v1, ptr %v2) nounwind uwtable {
 ; CHECK-LABEL: @atomic64_cas_seq_cst_ptr_ty(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[V1:%.*]] to i64, !pcsections !0
-; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[V2:%.*]] to i64, !pcsections !0
-; CHECK-NEXT:    [[TMP3:%.*]] = cmpxchg ptr [[A:%.*]], i64 [[TMP1]], i64 [[TMP2]] seq_cst seq_cst, align 8, !pcsections !0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP3]], 0, !pcsections !0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1, !pcsections !0
-; CHECK-NEXT:    [[TMP6:%.*]] = inttoptr i64 [[TMP4]] to ptr, !pcsections !0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP6]], 0, !pcsections !0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { ptr, i1 } [[TMP7]], i1 [[TMP5]], 1, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[V1:%.*]] to i64, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[V2:%.*]] to i64, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[A:%.*]], i64 [[TMP0]], i64 [[TMP1]] seq_cst seq_cst, align 8, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i64, i1 } [[TMP2]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = inttoptr i64 [[TMP3]] to ptr, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { ptr, i1 } poison, ptr [[TMP5]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP7:%.*]] = insertvalue { ptr, i1 } [[TMP6]], i1 [[TMP4]], 1, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -3356,8 +3356,8 @@ entry:
 define i128 @atomic128_load_unordered(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_load_unordered(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 0 monotonic monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[LOADED:%.*]] = extractvalue { i128, i1 } [[TMP0]], 0, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 0 monotonic monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[LOADED:%.*]] = extractvalue { i128, i1 } [[TMP0]], 0, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i128 [[LOADED]]
 ;
 entry:
@@ -3368,8 +3368,8 @@ entry:
 define i128 @atomic128_load_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_load_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 0 monotonic monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[LOADED:%.*]] = extractvalue { i128, i1 } [[TMP0]], 0, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 0 monotonic monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[LOADED:%.*]] = extractvalue { i128, i1 } [[TMP0]], 0, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i128 [[LOADED]]
 ;
 entry:
@@ -3380,8 +3380,8 @@ entry:
 define i128 @atomic128_load_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_load_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 0 acquire acquire, align 16, !pcsections !0
-; CHECK-NEXT:    [[LOADED:%.*]] = extractvalue { i128, i1 } [[TMP0]], 0, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 0 acquire acquire, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[LOADED:%.*]] = extractvalue { i128, i1 } [[TMP0]], 0, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i128 [[LOADED]]
 ;
 entry:
@@ -3392,8 +3392,8 @@ entry:
 define i128 @atomic128_load_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_load_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 0 seq_cst seq_cst, align 16, !pcsections !0
-; CHECK-NEXT:    [[LOADED:%.*]] = extractvalue { i128, i1 } [[TMP0]], 0, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 0 seq_cst seq_cst, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[LOADED:%.*]] = extractvalue { i128, i1 } [[TMP0]], 0, !pcsections [[META0]]
 ; CHECK-NEXT:    ret i128 [[LOADED]]
 ;
 entry:
@@ -3404,14 +3404,14 @@ entry:
 define void @atomic128_store_unordered(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_store_unordered(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 monotonic monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 monotonic monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3423,14 +3423,14 @@ entry:
 define void @atomic128_store_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_store_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 monotonic monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 monotonic monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3442,14 +3442,14 @@ entry:
 define void @atomic128_store_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_store_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 release monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 release monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3461,14 +3461,14 @@ entry:
 define void @atomic128_store_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_store_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 seq_cst seq_cst, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 seq_cst seq_cst, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3480,14 +3480,14 @@ entry:
 define void @atomic128_xchg_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_xchg_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 monotonic monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 monotonic monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3499,14 +3499,14 @@ entry:
 define void @atomic128_add_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_add_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] monotonic monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] monotonic monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3518,14 +3518,14 @@ entry:
 define void @atomic128_sub_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_sub_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] monotonic monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] monotonic monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3537,14 +3537,14 @@ entry:
 define void @atomic128_and_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_and_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 monotonic monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 monotonic monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3556,14 +3556,14 @@ entry:
 define void @atomic128_or_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_or_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] monotonic monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] monotonic monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3575,14 +3575,14 @@ entry:
 define void @atomic128_xor_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_xor_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] monotonic monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] monotonic monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3594,14 +3594,14 @@ entry:
 define void @atomic128_nand_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_nand_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 monotonic monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 monotonic monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3613,14 +3613,14 @@ entry:
 define void @atomic128_xchg_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_xchg_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 acquire acquire, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 acquire acquire, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3632,14 +3632,14 @@ entry:
 define void @atomic128_add_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_add_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acquire acquire, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acquire acquire, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3651,14 +3651,14 @@ entry:
 define void @atomic128_sub_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_sub_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acquire acquire, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acquire acquire, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3670,14 +3670,14 @@ entry:
 define void @atomic128_and_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_and_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 acquire acquire, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 acquire acquire, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3689,14 +3689,14 @@ entry:
 define void @atomic128_or_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_or_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acquire acquire, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acquire acquire, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3708,14 +3708,14 @@ entry:
 define void @atomic128_xor_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_xor_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acquire acquire, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acquire acquire, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3727,14 +3727,14 @@ entry:
 define void @atomic128_nand_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_nand_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 acquire acquire, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 acquire acquire, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3746,14 +3746,14 @@ entry:
 define void @atomic128_xchg_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_xchg_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 release monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 release monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3765,14 +3765,14 @@ entry:
 define void @atomic128_add_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_add_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] release monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] release monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3784,14 +3784,14 @@ entry:
 define void @atomic128_sub_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_sub_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] release monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] release monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3803,14 +3803,14 @@ entry:
 define void @atomic128_and_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_and_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 release monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 release monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3822,14 +3822,14 @@ entry:
 define void @atomic128_or_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_or_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] release monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] release monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3841,14 +3841,14 @@ entry:
 define void @atomic128_xor_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_xor_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] release monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] release monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3860,14 +3860,14 @@ entry:
 define void @atomic128_nand_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_nand_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 release monotonic, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 release monotonic, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3879,14 +3879,14 @@ entry:
 define void @atomic128_xchg_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_xchg_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 acq_rel acquire, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 acq_rel acquire, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3898,14 +3898,14 @@ entry:
 define void @atomic128_add_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_add_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acq_rel acquire, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acq_rel acquire, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3917,14 +3917,14 @@ entry:
 define void @atomic128_sub_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_sub_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acq_rel acquire, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acq_rel acquire, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3936,14 +3936,14 @@ entry:
 define void @atomic128_and_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_and_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 acq_rel acquire, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 acq_rel acquire, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3955,14 +3955,14 @@ entry:
 define void @atomic128_or_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_or_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acq_rel acquire, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acq_rel acquire, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3974,14 +3974,14 @@ entry:
 define void @atomic128_xor_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_xor_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acq_rel acquire, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] acq_rel acquire, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -3993,14 +3993,14 @@ entry:
 define void @atomic128_nand_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_nand_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 acq_rel acquire, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 acq_rel acquire, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -4012,14 +4012,14 @@ entry:
 define void @atomic128_xchg_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_xchg_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 seq_cst seq_cst, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 seq_cst seq_cst, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -4031,14 +4031,14 @@ entry:
 define void @atomic128_add_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_add_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] seq_cst seq_cst, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] seq_cst seq_cst, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -4050,14 +4050,14 @@ entry:
 define void @atomic128_sub_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_sub_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] seq_cst seq_cst, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] seq_cst seq_cst, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -4069,14 +4069,14 @@ entry:
 define void @atomic128_and_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_and_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 seq_cst seq_cst, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 0 seq_cst seq_cst, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -4088,14 +4088,14 @@ entry:
 define void @atomic128_or_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_or_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] seq_cst seq_cst, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] seq_cst seq_cst, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -4107,14 +4107,14 @@ entry:
 define void @atomic128_xor_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_xor_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] seq_cst seq_cst, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 [[LOADED]] seq_cst seq_cst, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -4126,14 +4126,14 @@ entry:
 define void @atomic128_nand_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_nand_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections !0
-; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i128, ptr [[A:%.*]], align 16, !pcsections [[META0]]
+; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections !0
-; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 seq_cst seq_cst, align 16, !pcsections !0
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections !0
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections !0
-; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections !0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i128 [ [[TMP0]], [[ENTRY:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ], !pcsections [[META0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = cmpxchg ptr [[A]], i128 [[LOADED]], i128 -1 seq_cst seq_cst, align 16, !pcsections [[META0]]
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i128, i1 } [[TMP1]], 1, !pcsections [[META0]]
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i128, i1 } [[TMP1]], 0, !pcsections [[META0]]
+; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]], !pcsections [[META0]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret void
 ;
@@ -4145,7 +4145,7 @@ entry:
 define void @atomic128_cas_monotonic(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_cas_monotonic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 monotonic monotonic, align 16, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 monotonic monotonic, align 16, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -4156,7 +4156,7 @@ entry:
 define void @atomic128_cas_acquire(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_cas_acquire(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 acquire acquire, align 16, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 acquire acquire, align 16, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -4167,7 +4167,7 @@ entry:
 define void @atomic128_cas_release(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_cas_release(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 release monotonic, align 16, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 release monotonic, align 16, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -4178,7 +4178,7 @@ entry:
 define void @atomic128_cas_acq_rel(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_cas_acq_rel(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 acq_rel acquire, align 16, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 acq_rel acquire, align 16, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -4189,7 +4189,7 @@ entry:
 define void @atomic128_cas_seq_cst(ptr %a) nounwind uwtable {
 ; CHECK-LABEL: @atomic128_cas_seq_cst(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 seq_cst seq_cst, align 16, !pcsections !0
+; CHECK-NEXT:    [[TMP0:%.*]] = cmpxchg ptr [[A:%.*]], i128 0, i128 1 seq_cst seq_cst, align 16, !pcsections [[META0]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll
index 78969839efcb8a..8fa41d0bc61eb8 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-mmra.ll
@@ -147,7 +147,6 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) #0 {
 ; GFX90A-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !mmra [[META0]]
 ; GFX90A-NEXT:    br label [[ATOMICRMW_PHI]]
 ; GFX90A:       atomicrmw.phi:
-; GFX90A-NEXT:    [[LOADED_PHI:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ]
 ; GFX90A-NEXT:    br label [[ATOMICRMW_END:%.*]]
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret void
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll
index fc586a01e3bcf8..a8b54ac33d9042 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll
@@ -166,7 +166,6 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
 ; GFX908-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
 ; GFX908-NEXT:    br label [[ATOMICRMW_PHI]]
 ; GFX908:       atomicrmw.phi:
-; GFX908-NEXT:    [[RES:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ]
 ; GFX908-NEXT:    br label [[ATOMICRMW_END:%.*]]
 ; GFX908:       atomicrmw.end:
 ; GFX908-NEXT:    ret void
@@ -192,7 +191,6 @@ define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
 ; GFX90A-NEXT:    [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VAL]] syncscope("workgroup") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]]
 ; GFX90A-NEXT:    br label [[ATOMICRMW_PHI]]
 ; GFX90A:       atomicrmw.phi:
-; GFX90A-NEXT:    [[RES:%.*]] = phi float [ [[TMP2]], [[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], [[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], [[ATOMICRMW_GLOBAL]] ]
 ; GFX90A-NEXT:    br label [[ATOMICRMW_END:%.*]]
 ; GFX90A:       atomicrmw.end:
 ; GFX90A-NEXT:    ret void
diff --git a/llvm/test/Transforms/AtomicExpand/ARM/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/ARM/atomicrmw-fp.ll
index 9c4ce50da69170..cc4104df5d725d 100644
--- a/llvm/test/Transforms/AtomicExpand/ARM/atomicrmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/ARM/atomicrmw-fp.ll
@@ -7,18 +7,18 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
 ; CHECK-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] monotonic monotonic
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    call void @llvm.arm.dmb(i32 11)
-; CHECK-NEXT:    ret float [[TMP6]]
+; CHECK-NEXT:    ret float [[TMP5]]
 ;
   %res = atomicrmw fadd ptr %ptr, float %value seq_cst
   ret float %res
@@ -30,18 +30,18 @@ define float @test_atomicrmw_fsub_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
 ; CHECK-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] monotonic monotonic
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    call void @llvm.arm.dmb(i32 11)
-; CHECK-NEXT:    ret float [[TMP6]]
+; CHECK-NEXT:    ret float [[TMP5]]
 ;
   %res = atomicrmw fsub ptr %ptr, float %value seq_cst
   ret float %res
diff --git a/llvm/test/Transforms/AtomicExpand/ARM/cmpxchg-weak.ll b/llvm/test/Transforms/AtomicExpand/ARM/cmpxchg-weak.ll
index 23aa57e18ecc5a..8195a5b6145e3a 100644
--- a/llvm/test/Transforms/AtomicExpand/ARM/cmpxchg-weak.ll
+++ b/llvm/test/Transforms/AtomicExpand/ARM/cmpxchg-weak.ll
@@ -1,169 +1,154 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -passes=atomic-expand -codegen-opt-level=1 -S -mtriple=thumbv7s-apple-ios7.0 %s | FileCheck %s
 
-define i32 @test_cmpxchg_seq_cst(ptr %addr, i32 %desired, i32 %new) {
-; CHECK-LABEL: @test_cmpxchg_seq_cst
 ; Intrinsic for "dmb ishst" is then expected
-; CHECK:     br label %[[START:.*]]
-
-; CHECK: [[START]]:
-; CHECK:     [[LOADED:%.*]] = call i32 @llvm.arm.ldrex.p0(ptr elementtype(i32) %addr)
-; CHECK:     [[SHOULD_STORE:%.*]] = icmp eq i32 [[LOADED]], %desired
-; CHECK:     br i1 [[SHOULD_STORE]], label %[[FENCED_STORE:.*]], label %[[NO_STORE_BB:.*]]
-
-; CHECK: [[FENCED_STORE]]:
-; CHECK:     call void @llvm.arm.dmb(i32 10)
-; CHECK:     br label %[[TRY_STORE:.*]]
-
-; CHECK: [[TRY_STORE]]:
-; CHECK:     [[LOADED_TRYSTORE:%.*]] = phi i32 [ [[LOADED]], %[[FENCED_STORE]] ]
-; CHECK:     [[STREX:%.*]] = call i32 @llvm.arm.strex.p0(i32 %new, ptr elementtype(i32) %addr)
-; CHECK:     [[SUCCESS:%.*]] = icmp eq i32 [[STREX]], 0
-; CHECK:     br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB:.*]]
-
-; CHECK: [[SUCCESS_BB]]:
-; CHECK:     call void @llvm.arm.dmb(i32 11)
-; CHECK:     br label %[[END:.*]]
-
-; CHECK: [[NO_STORE_BB]]:
-; CHECK:     [[LOADED_NOSTORE:%.*]] = phi i32 [ [[LOADED]], %[[START]] ]
-; CHECK:     call void @llvm.arm.clrex()
-; CHECK:     br label %[[FAILURE_BB]]
-
-; CHECK: [[FAILURE_BB]]:
-; CHECK:     [[LOADED_FAILURE:%.*]] = phi i32 [ [[LOADED_NOSTORE]], %[[NO_STORE_BB]] ], [ [[LOADED_TRYSTORE]], %[[TRY_STORE]] ]
-; CHECK:     call void @llvm.arm.dmb(i32 11)
-; CHECK:     br label %[[END]]
-
-; CHECK: [[END]]:
-; CHECK:     [[LOADED_EXIT:%.*]] = phi i32 [ [[LOADED_TRYSTORE]], %[[SUCCESS_BB]] ], [ [[LOADED_FAILURE]], %[[FAILURE_BB]] ]
-; CHECK:     [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
-; CHECK:     ret i32 [[LOADED_EXIT]]
-
+define i32 @test_cmpxchg_seq_cst(ptr %addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: define i32 @test_cmpxchg_seq_cst(
+; CHECK-SAME: ptr [[ADDR:%.*]], i32 [[DESIRED:%.*]], i32 [[NEW:%.*]]) {
+; CHECK-NEXT:    br label %[[CMPXCHG_START:.*]]
+; CHECK:       [[CMPXCHG_START]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.ldrex.p0(ptr elementtype(i32) [[ADDR]])
+; CHECK-NEXT:    [[SHOULD_STORE:%.*]] = icmp eq i32 [[TMP1]], [[DESIRED]]
+; CHECK-NEXT:    br i1 [[SHOULD_STORE]], label %[[CMPXCHG_FENCEDSTORE:.*]], label %[[CMPXCHG_NOSTORE:.*]]
+; CHECK:       [[CMPXCHG_FENCEDSTORE]]:
+; CHECK-NEXT:    call void @llvm.arm.dmb(i32 10)
+; CHECK-NEXT:    br label %[[CMPXCHG_TRYSTORE:.*]]
+; CHECK:       [[CMPXCHG_TRYSTORE]]:
+; CHECK-NEXT:    [[LOADED_TRYSTORE:%.*]] = phi i32 [ [[TMP1]], %[[CMPXCHG_FENCEDSTORE]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.strex.p0(i32 [[NEW]], ptr elementtype(i32) [[ADDR]])
+; CHECK-NEXT:    [[SUCCESS:%.*]] = icmp eq i32 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[SUCCESS]], label %[[CMPXCHG_SUCCESS:.*]], label %[[CMPXCHG_FAILURE:.*]]
+; CHECK:       [[CMPXCHG_RELEASEDLOAD:.*:]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[CMPXCHG_SUCCESS]]:
+; CHECK-NEXT:    call void @llvm.arm.dmb(i32 11)
+; CHECK-NEXT:    br label %[[CMPXCHG_END:.*]]
+; CHECK:       [[CMPXCHG_NOSTORE]]:
+; CHECK-NEXT:    [[LOADED_NOSTORE:%.*]] = phi i32 [ [[TMP1]], %[[CMPXCHG_START]] ]
+; CHECK-NEXT:    call void @llvm.arm.clrex()
+; CHECK-NEXT:    br label %[[CMPXCHG_FAILURE]]
+; CHECK:       [[CMPXCHG_FAILURE]]:
+; CHECK-NEXT:    [[LOADED_FAILURE:%.*]] = phi i32 [ [[LOADED_NOSTORE]], %[[CMPXCHG_NOSTORE]] ], [ [[LOADED_TRYSTORE]], %[[CMPXCHG_TRYSTORE]] ]
+; CHECK-NEXT:    call void @llvm.arm.dmb(i32 11)
+; CHECK-NEXT:    br label %[[CMPXCHG_END]]
+; CHECK:       [[CMPXCHG_END]]:
+; CHECK-NEXT:    [[LOADED_EXIT:%.*]] = phi i32 [ [[LOADED_TRYSTORE]], %[[CMPXCHG_SUCCESS]] ], [ [[LOADED_FAILURE]], %[[CMPXCHG_FAILURE]] ]
+; CHECK-NEXT:    [[SUCCESS1:%.*]] = phi i1 [ true, %[[CMPXCHG_SUCCESS]] ], [ false, %[[CMPXCHG_FAILURE]] ]
+; CHECK-NEXT:    ret i32 [[LOADED_EXIT]]
+;
   %pair = cmpxchg weak ptr %addr, i32 %desired, i32 %new seq_cst seq_cst
   %oldval = extractvalue { i32, i1 } %pair, 0
   ret i32 %oldval
 }
 
 define i1 @test_cmpxchg_weak_fail(ptr %addr, i32 %desired, i32 %new) {
-; CHECK-LABEL: @test_cmpxchg_weak_fail
-; CHECK:     br label %[[START:.*]]
-
-; CHECK: [[START]]:
-; CHECK:     [[LOADED:%.*]] = call i32 @llvm.arm.ldrex.p0(ptr elementtype(i32) %addr)
-; CHECK:     [[SHOULD_STORE:%.*]] = icmp eq i32 [[LOADED]], %desired
-; CHECK:     br i1 [[SHOULD_STORE]], label %[[FENCED_STORE:.*]], label %[[NO_STORE_BB:.*]]
-
-; CHECK: [[FENCED_STORE]]:
-; CHECK:     call void @llvm.arm.dmb(i32 10)
-; CHECK:     br label %[[TRY_STORE:.*]]
-
-; CHECK: [[TRY_STORE]]:
-; CHECK:     [[STREX:%.*]] = call i32 @llvm.arm.strex.p0(i32 %new, ptr elementtype(i32) %addr)
-; CHECK:     [[SUCCESS:%.*]] = icmp eq i32 [[STREX]], 0
-; CHECK:     br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB:.*]]
-
-; CHECK: [[SUCCESS_BB]]:
-; CHECK:     call void @llvm.arm.dmb(i32 11)
-; CHECK:     br label %[[END:.*]]
-
-; CHECK: [[NO_STORE_BB]]:
-; CHECK:     call void @llvm.arm.clrex()
-; CHECK:     br label %[[FAILURE_BB]]
-
-; CHECK: [[FAILURE_BB]]:
-; CHECK-NOT: dmb
-; CHECK:     br label %[[END]]
-
-; CHECK: [[END]]:
-; CHECK:     [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
-; CHECK:     ret i1 [[SUCCESS]]
-
+; CHECK-LABEL: define i1 @test_cmpxchg_weak_fail(
+; CHECK-SAME: ptr [[ADDR:%.*]], i32 [[DESIRED:%.*]], i32 [[NEW:%.*]]) {
+; CHECK-NEXT:    br label %[[CMPXCHG_START:.*]]
+; CHECK:       [[CMPXCHG_START]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.ldrex.p0(ptr elementtype(i32) [[ADDR]])
+; CHECK-NEXT:    [[SHOULD_STORE:%.*]] = icmp eq i32 [[TMP1]], [[DESIRED]]
+; CHECK-NEXT:    br i1 [[SHOULD_STORE]], label %[[CMPXCHG_FENCEDSTORE:.*]], label %[[CMPXCHG_NOSTORE:.*]]
+; CHECK:       [[CMPXCHG_FENCEDSTORE]]:
+; CHECK-NEXT:    call void @llvm.arm.dmb(i32 10)
+; CHECK-NEXT:    br label %[[CMPXCHG_TRYSTORE:.*]]
+; CHECK:       [[CMPXCHG_TRYSTORE]]:
+; CHECK-NEXT:    [[LOADED_TRYSTORE:%.*]] = phi i32 [ [[TMP1]], %[[CMPXCHG_FENCEDSTORE]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.strex.p0(i32 [[NEW]], ptr elementtype(i32) [[ADDR]])
+; CHECK-NEXT:    [[SUCCESS:%.*]] = icmp eq i32 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[SUCCESS]], label %[[CMPXCHG_SUCCESS:.*]], label %[[CMPXCHG_FAILURE:.*]]
+; CHECK:       [[CMPXCHG_RELEASEDLOAD:.*:]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[CMPXCHG_SUCCESS]]:
+; CHECK-NEXT:    call void @llvm.arm.dmb(i32 11)
+; CHECK-NEXT:    br label %[[CMPXCHG_END:.*]]
+; CHECK:       [[CMPXCHG_NOSTORE]]:
+; CHECK-NEXT:    [[LOADED_NOSTORE:%.*]] = phi i32 [ [[TMP1]], %[[CMPXCHG_START]] ]
+; CHECK-NEXT:    call void @llvm.arm.clrex()
+; CHECK-NEXT:    br label %[[CMPXCHG_FAILURE]]
+; CHECK:       [[CMPXCHG_FAILURE]]:
+; CHECK-NEXT:    [[LOADED_FAILURE:%.*]] = phi i32 [ [[LOADED_NOSTORE]], %[[CMPXCHG_NOSTORE]] ], [ [[LOADED_TRYSTORE]], %[[CMPXCHG_TRYSTORE]] ]
+; CHECK-NEXT:    br label %[[CMPXCHG_END]]
+; CHECK:       [[CMPXCHG_END]]:
+; CHECK-NEXT:    [[LOADED_EXIT:%.*]] = phi i32 [ [[LOADED_TRYSTORE]], %[[CMPXCHG_SUCCESS]] ], [ [[LOADED_FAILURE]], %[[CMPXCHG_FAILURE]] ]
+; CHECK-NEXT:    [[SUCCESS1:%.*]] = phi i1 [ true, %[[CMPXCHG_SUCCESS]] ], [ false, %[[CMPXCHG_FAILURE]] ]
+; CHECK-NEXT:    ret i1 [[SUCCESS1]]
+;
   %pair = cmpxchg weak ptr %addr, i32 %desired, i32 %new seq_cst monotonic
   %oldval = extractvalue { i32, i1 } %pair, 1
   ret i1 %oldval
 }
 
 define i32 @test_cmpxchg_monotonic(ptr %addr, i32 %desired, i32 %new) {
-; CHECK-LABEL: @test_cmpxchg_monotonic
-; CHECK-NOT: dmb
-; CHECK:     br label %[[START:.*]]
-
-; CHECK: [[START]]:
-; CHECK:     [[LOADED:%.*]] = call i32 @llvm.arm.ldrex.p0(ptr elementtype(i32) %addr)
-; CHECK:     [[SHOULD_STORE:%.*]] = icmp eq i32 [[LOADED]], %desired
-; CHECK:     br i1 [[SHOULD_STORE]], label %[[FENCED_STORE:.*]], label %[[NO_STORE_BB:.*]]
-
-; CHECK: [[FENCED_STORE]]:
-; CHECK-NEXT: br label %[[TRY_STORE]]
-
-; CHECK: [[TRY_STORE]]:
-; CHECK:     [[LOADED_TRYSTORE:%.*]] = phi i32 [ [[LOADED]], %[[FENCED_STORE]] ]
-; CHECK:     [[STREX:%.*]] = call i32 @llvm.arm.strex.p0(i32 %new, ptr elementtype(i32) %addr)
-; CHECK:     [[SUCCESS:%.*]] = icmp eq i32 [[STREX]], 0
-; CHECK:     br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB:.*]]
-
-; CHECK: [[SUCCESS_BB]]:
-; CHECK-NOT: dmb
-; CHECK:     br label %[[END:.*]]
-
-; CHECK: [[NO_STORE_BB]]:
-; CHECK:     [[LOADED_NOSTORE:%.*]] = phi i32 [ [[LOADED]], %[[START]] ]
-; CHECK:     call void @llvm.arm.clrex()
-; CHECK:     br label %[[FAILURE_BB]]
-
-; CHECK: [[FAILURE_BB]]:
-; CHECK:     [[LOADED_FAILURE:%.*]] = phi i32 [ [[LOADED_NOSTORE]], %[[NO_STORE_BB]] ], [ [[LOADED_TRYSTORE]], %[[TRY_STORE]] ]
-; CHECK-NOT: dmb
-; CHECK:     br label %[[END]]
-
-; CHECK: [[END]]:
-; CHECK:     [[LOADED_EXIT:%.*]] = phi i32 [ [[LOADED_TRYSTORE]], %[[SUCCESS_BB]] ], [ [[LOADED_FAILURE]], %[[FAILURE_BB]] ]
-; CHECK:     [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
-; CHECK:     ret i32 [[LOADED_EXIT]]
-
+; CHECK-LABEL: define i32 @test_cmpxchg_monotonic(
+; CHECK-SAME: ptr [[ADDR:%.*]], i32 [[DESIRED:%.*]], i32 [[NEW:%.*]]) {
+; CHECK-NEXT:    br label %[[CMPXCHG_START:.*]]
+; CHECK:       [[CMPXCHG_START]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.ldrex.p0(ptr elementtype(i32) [[ADDR]])
+; CHECK-NEXT:    [[SHOULD_STORE:%.*]] = icmp eq i32 [[TMP1]], [[DESIRED]]
+; CHECK-NEXT:    br i1 [[SHOULD_STORE]], label %[[CMPXCHG_FENCEDSTORE:.*]], label %[[CMPXCHG_NOSTORE:.*]]
+; CHECK:       [[CMPXCHG_FENCEDSTORE]]:
+; CHECK-NEXT:    br label %[[CMPXCHG_TRYSTORE:.*]]
+; CHECK:       [[CMPXCHG_TRYSTORE]]:
+; CHECK-NEXT:    [[LOADED_TRYSTORE:%.*]] = phi i32 [ [[TMP1]], %[[CMPXCHG_FENCEDSTORE]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.strex.p0(i32 [[NEW]], ptr elementtype(i32) [[ADDR]])
+; CHECK-NEXT:    [[SUCCESS:%.*]] = icmp eq i32 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[SUCCESS]], label %[[CMPXCHG_SUCCESS:.*]], label %[[CMPXCHG_FAILURE:.*]]
+; CHECK:       [[CMPXCHG_RELEASEDLOAD:.*:]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[CMPXCHG_SUCCESS]]:
+; CHECK-NEXT:    br label %[[CMPXCHG_END:.*]]
+; CHECK:       [[CMPXCHG_NOSTORE]]:
+; CHECK-NEXT:    [[LOADED_NOSTORE:%.*]] = phi i32 [ [[TMP1]], %[[CMPXCHG_START]] ]
+; CHECK-NEXT:    call void @llvm.arm.clrex()
+; CHECK-NEXT:    br label %[[CMPXCHG_FAILURE]]
+; CHECK:       [[CMPXCHG_FAILURE]]:
+; CHECK-NEXT:    [[LOADED_FAILURE:%.*]] = phi i32 [ [[LOADED_NOSTORE]], %[[CMPXCHG_NOSTORE]] ], [ [[LOADED_TRYSTORE]], %[[CMPXCHG_TRYSTORE]] ]
+; CHECK-NEXT:    br label %[[CMPXCHG_END]]
+; CHECK:       [[CMPXCHG_END]]:
+; CHECK-NEXT:    [[LOADED_EXIT:%.*]] = phi i32 [ [[LOADED_TRYSTORE]], %[[CMPXCHG_SUCCESS]] ], [ [[LOADED_FAILURE]], %[[CMPXCHG_FAILURE]] ]
+; CHECK-NEXT:    [[SUCCESS1:%.*]] = phi i1 [ true, %[[CMPXCHG_SUCCESS]] ], [ false, %[[CMPXCHG_FAILURE]] ]
+; CHECK-NEXT:    ret i32 [[LOADED_EXIT]]
+;
   %pair = cmpxchg weak ptr %addr, i32 %desired, i32 %new monotonic monotonic
   %oldval = extractvalue { i32, i1 } %pair, 0
   ret i32 %oldval
 }
 
 define i32 @test_cmpxchg_seq_cst_minsize(ptr %addr, i32 %desired, i32 %new) minsize {
-; CHECK-LABEL: @test_cmpxchg_seq_cst_minsize
-; CHECK:     br label %[[START:.*]]
-
-; CHECK: [[START]]:
-; CHECK:     [[LOADED:%.*]] = call i32 @llvm.arm.ldrex.p0(ptr elementtype(i32) %addr)
-; CHECK:     [[SHOULD_STORE:%.*]] = icmp eq i32 [[LOADED]], %desired
-; CHECK:     br i1 [[SHOULD_STORE]], label %[[FENCED_STORE:.*]], label %[[NO_STORE_BB:.*]]
-
-; CHECK: [[FENCED_STORE]]:
-; CHECK:     call void @llvm.arm.dmb(i32 10)
-; CHECK:     br label %[[TRY_STORE:.*]]
-
-; CHECK: [[TRY_STORE]]:
-; CHECK:     [[LOADED_TRYSTORE:%.*]] = phi i32 [ [[LOADED]], %[[FENCED_STORE]] ]
-; CHECK:     [[STREX:%.*]] = call i32 @llvm.arm.strex.p0(i32 %new, ptr elementtype(i32) %addr)
-; CHECK:     [[SUCCESS:%.*]] = icmp eq i32 [[STREX]], 0
-; CHECK:     br i1 [[SUCCESS]], label %[[SUCCESS_BB:.*]], label %[[FAILURE_BB:.*]]
-
-; CHECK: [[SUCCESS_BB]]:
-; CHECK:     call void @llvm.arm.dmb(i32 11)
-; CHECK:     br label %[[END:.*]]
-
-; CHECK: [[NO_STORE_BB]]:
-; CHECK:     [[LOADED_NOSTORE:%.*]] = phi i32 [ [[LOADED]], %[[START]] ]
-; CHECK:     call void @llvm.arm.clrex()
-; CHECK:     br label %[[FAILURE_BB]]
-
-; CHECK: [[FAILURE_BB]]:
-; CHECK:     [[LOADED_FAILURE:%.*]] = phi i32 [ [[LOADED_NOSTORE]], %[[NO_STORE_BB]] ], [ [[LOADED_TRYSTORE]], %[[TRY_STORE]] ]
-; CHECK:     call void @llvm.arm.dmb(i32 11)
-; CHECK:     br label %[[END]]
-
-; CHECK: [[END]]:
-; CHECK:     [[LOADED_EXIT:%.*]] = phi i32 [ [[LOADED_TRYSTORE]], %[[SUCCESS_BB]] ], [ [[LOADED_FAILURE]], %[[FAILURE_BB]] ]
-; CHECK:     [[SUCCESS:%.*]] = phi i1 [ true, %[[SUCCESS_BB]] ], [ false, %[[FAILURE_BB]] ]
-; CHECK:     ret i32 [[LOADED_EXIT]]
-
+; CHECK-LABEL: define i32 @test_cmpxchg_seq_cst_minsize(
+; CHECK-SAME: ptr [[ADDR:%.*]], i32 [[DESIRED:%.*]], i32 [[NEW:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    br label %[[CMPXCHG_START:.*]]
+; CHECK:       [[CMPXCHG_START]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.arm.ldrex.p0(ptr elementtype(i32) [[ADDR]])
+; CHECK-NEXT:    [[SHOULD_STORE:%.*]] = icmp eq i32 [[TMP1]], [[DESIRED]]
+; CHECK-NEXT:    br i1 [[SHOULD_STORE]], label %[[CMPXCHG_FENCEDSTORE:.*]], label %[[CMPXCHG_NOSTORE:.*]]
+; CHECK:       [[CMPXCHG_FENCEDSTORE]]:
+; CHECK-NEXT:    call void @llvm.arm.dmb(i32 10)
+; CHECK-NEXT:    br label %[[CMPXCHG_TRYSTORE:.*]]
+; CHECK:       [[CMPXCHG_TRYSTORE]]:
+; CHECK-NEXT:    [[LOADED_TRYSTORE:%.*]] = phi i32 [ [[TMP1]], %[[CMPXCHG_FENCEDSTORE]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.arm.strex.p0(i32 [[NEW]], ptr elementtype(i32) [[ADDR]])
+; CHECK-NEXT:    [[SUCCESS:%.*]] = icmp eq i32 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[SUCCESS]], label %[[CMPXCHG_SUCCESS:.*]], label %[[CMPXCHG_FAILURE:.*]]
+; CHECK:       [[CMPXCHG_RELEASEDLOAD:.*:]]
+; CHECK-NEXT:    unreachable
+; CHECK:       [[CMPXCHG_SUCCESS]]:
+; CHECK-NEXT:    call void @llvm.arm.dmb(i32 11)
+; CHECK-NEXT:    br label %[[CMPXCHG_END:.*]]
+; CHECK:       [[CMPXCHG_NOSTORE]]:
+; CHECK-NEXT:    [[LOADED_NOSTORE:%.*]] = phi i32 [ [[TMP1]], %[[CMPXCHG_START]] ]
+; CHECK-NEXT:    call void @llvm.arm.clrex()
+; CHECK-NEXT:    br label %[[CMPXCHG_FAILURE]]
+; CHECK:       [[CMPXCHG_FAILURE]]:
+; CHECK-NEXT:    [[LOADED_FAILURE:%.*]] = phi i32 [ [[LOADED_NOSTORE]], %[[CMPXCHG_NOSTORE]] ], [ [[LOADED_TRYSTORE]], %[[CMPXCHG_TRYSTORE]] ]
+; CHECK-NEXT:    call void @llvm.arm.dmb(i32 11)
+; CHECK-NEXT:    br label %[[CMPXCHG_END]]
+; CHECK:       [[CMPXCHG_END]]:
+; CHECK-NEXT:    [[LOADED_EXIT:%.*]] = phi i32 [ [[LOADED_TRYSTORE]], %[[CMPXCHG_SUCCESS]] ], [ [[LOADED_FAILURE]], %[[CMPXCHG_FAILURE]] ]
+; CHECK-NEXT:    [[SUCCESS1:%.*]] = phi i1 [ true, %[[CMPXCHG_SUCCESS]] ], [ false, %[[CMPXCHG_FAILURE]] ]
+; CHECK-NEXT:    ret i32 [[LOADED_EXIT]]
+;
   %pair = cmpxchg weak ptr %addr, i32 %desired, i32 %new seq_cst seq_cst
   %oldval = extractvalue { i32, i1 } %pair, 0
   ret i32 %oldval
diff --git a/llvm/test/Transforms/AtomicExpand/Hexagon/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/Hexagon/atomicrmw-fp.ll
index 9e64db0a5e31d3..7649e7116b3899 100644
--- a/llvm/test/Transforms/AtomicExpand/Hexagon/atomicrmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/Hexagon/atomicrmw-fp.ll
@@ -6,15 +6,15 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
 ; CHECK-NEXT:    [[LARX:%.*]] = call i32 @llvm.hexagon.L2.loadw.locked(ptr [[PTR:%.*]])
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[LARX]] to float
-; CHECK-NEXT:    [[NEW:%.*]] = fadd float [[TMP2]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[STCX:%.*]] = call i32 @llvm.hexagon.S2.storew.locked(ptr [[PTR]], i32 [[TMP4]])
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[STCX]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; CHECK-NEXT:    br i1 [[TMP5]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[LARX]] to float
+; CHECK-NEXT:    [[NEW:%.*]] = fadd float [[TMP1]], [[VALUE:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CHECK-NEXT:    [[STCX:%.*]] = call i32 @llvm.hexagon.S2.storew.locked(ptr [[PTR]], i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[STCX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    br i1 [[TMP3]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    ret float [[TMP2]]
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
   %res = atomicrmw fadd ptr %ptr, float %value seq_cst
   ret float %res
@@ -25,15 +25,15 @@ define float @test_atomicrmw_fsub_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
 ; CHECK-NEXT:    [[LARX:%.*]] = call i32 @llvm.hexagon.L2.loadw.locked(ptr [[PTR:%.*]])
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32 [[LARX]] to float
-; CHECK-NEXT:    [[NEW:%.*]] = fsub float [[TMP2]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[STCX:%.*]] = call i32 @llvm.hexagon.S2.storew.locked(ptr [[PTR]], i32 [[TMP4]])
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[STCX]], 0
-; CHECK-NEXT:    [[TMP6:%.*]] = zext i1 [[TMP5]] to i32
-; CHECK-NEXT:    br i1 [[TMP5]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[LARX]] to float
+; CHECK-NEXT:    [[NEW:%.*]] = fsub float [[TMP1]], [[VALUE:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CHECK-NEXT:    [[STCX:%.*]] = call i32 @llvm.hexagon.S2.storew.locked(ptr [[PTR]], i32 [[TMP2]])
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[STCX]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i1 [[TMP3]] to i32
+; CHECK-NEXT:    br i1 [[TMP3]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    ret float [[TMP2]]
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
   %res = atomicrmw fsub ptr %ptr, float %value seq_cst
   ret float %res
diff --git a/llvm/test/Transforms/AtomicExpand/Mips/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/Mips/atomicrmw-fp.ll
index 0fd1cbe8bdd0e9..3244683916bc00 100644
--- a/llvm/test/Transforms/AtomicExpand/Mips/atomicrmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/Mips/atomicrmw-fp.ll
@@ -7,18 +7,18 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
 ; CHECK-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] monotonic monotonic
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    fence seq_cst
-; CHECK-NEXT:    ret float [[TMP6]]
+; CHECK-NEXT:    ret float [[TMP5]]
 ;
   %res = atomicrmw fadd ptr %ptr, float %value seq_cst
   ret float %res
@@ -30,18 +30,18 @@ define float @test_atomicrmw_fsub_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
 ; CHECK-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] monotonic monotonic
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    fence seq_cst
-; CHECK-NEXT:    ret float [[TMP6]]
+; CHECK-NEXT:    ret float [[TMP5]]
 ;
   %res = atomicrmw fsub ptr %ptr, float %value seq_cst
   ret float %res
diff --git a/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll
index a3d62e06a7cd60..f787aa7f6a42be 100644
--- a/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/PowerPC/atomicrmw-fp.ll
@@ -7,18 +7,18 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
 ; CHECK-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] monotonic monotonic, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    call void @llvm.ppc.lwsync()
-; CHECK-NEXT:    ret float [[TMP6]]
+; CHECK-NEXT:    ret float [[TMP5]]
 ;
   %res = atomicrmw fadd ptr %ptr, float %value seq_cst
   ret float %res
@@ -30,18 +30,18 @@ define float @test_atomicrmw_fsub_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
 ; CHECK-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] monotonic monotonic, align 4
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] monotonic monotonic, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    call void @llvm.ppc.lwsync()
-; CHECK-NEXT:    ret float [[TMP6]]
+; CHECK-NEXT:    ret float [[TMP5]]
 ;
   %res = atomicrmw fsub ptr %ptr, float %value seq_cst
   ret float %res
diff --git a/llvm/test/Transforms/AtomicExpand/RISCV/atomicrmw-fp.ll b/llvm/test/Transforms/AtomicExpand/RISCV/atomicrmw-fp.ll
index 7e41583189c3d3..2cbb1794b43407 100644
--- a/llvm/test/Transforms/AtomicExpand/RISCV/atomicrmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/RISCV/atomicrmw-fp.ll
@@ -11,14 +11,14 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[TMP1]])
 ; CHECK-NEXT:    store float [[LOADED]], ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange_4(ptr [[PTR]], ptr [[TMP1]], i32 [[TMP5]], i32 5, i32 5)
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[NEW]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = call zeroext i1 @__atomic_compare_exchange_4(ptr [[PTR]], ptr [[TMP1]], i32 [[TMP3]], i32 5, i32 5)
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[TMP1]], align 4
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP1]])
-; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { float, i1 } poison, float [[TMP7]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { float, i1 } [[TMP8]], i1 [[TMP6]], 1
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { float, i1 } [[TMP9]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { float, i1 } [[TMP9]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { float, i1 } poison, float [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertvalue { float, i1 } [[TMP6]], i1 [[TMP4]], 1
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { float, i1 } [[TMP7]], 1
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { float, i1 } [[TMP7]], 0
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret float [[NEWLOADED]]
@@ -37,14 +37,14 @@ define float @test_atomicrmw_fsub_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE:%.*]]
 ; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[TMP1]])
 ; CHECK-NEXT:    store float [[LOADED]], ptr [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[TMP6:%.*]] = call zeroext i1 @__atomic_compare_exchange_4(ptr [[PTR]], ptr [[TMP1]], i32 [[TMP5]], i32 5, i32 5)
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[NEW]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = call zeroext i1 @__atomic_compare_exchange_4(ptr [[PTR]], ptr [[TMP1]], i32 [[TMP3]], i32 5, i32 5)
+; CHECK-NEXT:    [[TMP5:%.*]] = load float, ptr [[TMP1]], align 4
 ; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP1]])
-; CHECK-NEXT:    [[TMP8:%.*]] = insertvalue { float, i1 } poison, float [[TMP7]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = insertvalue { float, i1 } [[TMP8]], i1 [[TMP6]], 1
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { float, i1 } [[TMP9]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { float, i1 } [[TMP9]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertvalue { float, i1 } poison, float [[TMP5]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertvalue { float, i1 } [[TMP6]], i1 [[TMP4]], 1
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { float, i1 } [[TMP7]], 1
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { float, i1 } [[TMP7]], 0
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
 ; CHECK-NEXT:    ret float [[NEWLOADED]]
diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-fp.ll b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-fp.ll
index 3866530abb7966..024b2e33af5dad 100644
--- a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-rmw-fp.ll
@@ -6,17 +6,17 @@ define float @test_atomicrmw_fadd_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
 ; CHECK-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    ret float [[TMP6]]
+; CHECK-NEXT:    ret float [[TMP5]]
 ;
   %res = atomicrmw fadd ptr %ptr, float %value seq_cst
   ret float %res
@@ -27,17 +27,17 @@ define double @test_atomicrmw_fadd_f64(ptr %ptr, double %value) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
 ; CHECK-NEXT:    [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double [[NEW]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    ret double [[TMP6]]
+; CHECK-NEXT:    ret double [[TMP5]]
 ;
   %res = atomicrmw fadd ptr %ptr, double %value seq_cst
   ret double %res
@@ -48,17 +48,17 @@ define float @test_atomicrmw_fadd_f32_as1(ptr addrspace(1) %ptr, float %value) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
 ; CHECK-NEXT:    [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    ret float [[TMP6]]
+; CHECK-NEXT:    ret float [[TMP5]]
 ;
   %res = atomicrmw fadd ptr addrspace(1) %ptr, float %value seq_cst
   ret float %res
@@ -69,17 +69,17 @@ define float @test_atomicrmw_fsub_f32(ptr %ptr, float %value) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
 ; CHECK-NEXT:    [[NEW:%.*]] = fsub float [[LOADED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[NEW]] to i32
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float [[LOADED]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] seq_cst seq_cst
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6]] = bitcast i32 [[NEWLOADED]] to float
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    ret float [[TMP6]]
+; CHECK-NEXT:    ret float [[TMP5]]
 ;
   %res = atomicrmw fsub ptr %ptr, float %value seq_cst
   ret float %res
@@ -90,17 +90,17 @@ define double @test_atomicrmw_fsub_f64(ptr %ptr, double %value) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
 ; CHECK-NEXT:    [[NEW:%.*]] = fsub double [[LOADED]], [[VALUE:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double [[NEW]] to i64
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double [[LOADED]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] seq_cst seq_cst
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1
-; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0
-; CHECK-NEXT:    [[TMP6]] = bitcast i64 [[NEWLOADED]] to double
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; CHECK-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
+; CHECK-NEXT:    [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    ret double [[TMP6]]
+; CHECK-NEXT:    ret double [[TMP5]]
 ;
   %res = atomicrmw fsub ptr %ptr, double %value seq_cst
   ret double %res
diff --git a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll
index 211c6c5886413a..c88671e3bb407f 100644
--- a/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll
+++ b/llvm/test/Transforms/AtomicExpand/X86/expand-atomic-xchg-fp.ll
@@ -3,17 +3,17 @@
 
 define double @atomic_xchg_f64(ptr %ptr) nounwind {
 ; CHECK-LABEL: @atomic_xchg_f64(
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[PTR:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[PTR:%.*]], align 8
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 4616189618054758400 seq_cst seq_cst, align 8
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 4616189618054758400 seq_cst seq_cst, align 8
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64 [[NEWLOADED]] to double
-; CHECK-NEXT:    ret double [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64 [[NEWLOADED]] to double
+; CHECK-NEXT:    ret double [[TMP3]]
 ;
   %result = atomicrmw xchg ptr %ptr, double 4.0 seq_cst
   ret double %result
@@ -21,17 +21,17 @@ define double @atomic_xchg_f64(ptr %ptr) nounwind {
 
 define double @atomic_xchg_f64_as1(ptr addrspace(1) %ptr) nounwind {
 ; CHECK-LABEL: @atomic_xchg_f64_as1(
-; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr addrspace(1) [[PTR:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR:%.*]], align 8
 ; CHECK-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CHECK:       atomicrmw.start:
-; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP2]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 4616189618054758400 seq_cst seq_cst, align 8
-; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1
-; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP3]], 0
+; CHECK-NEXT:    [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 4616189618054758400 seq_cst seq_cst, align 8
+; CHECK-NEXT:    [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
+; CHECK-NEXT:    [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0
 ; CHECK-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
 ; CHECK:       atomicrmw.end:
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i64 [[NEWLOADED]] to double
-; CHECK-NEXT:    ret double [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64 [[NEWLOADED]] to double
+; CHECK-NEXT:    ret double [[TMP3]]
 ;
   %result = atomicrmw xchg ptr addrspace(1) %ptr, double 4.0 seq_cst
   ret double %result
diff --git a/llvm/test/Transforms/GlobalMerge/private-global.ll b/llvm/test/Transforms/GlobalMerge/private-global.ll
new file mode 100644
index 00000000000000..c4152a242d59fb
--- /dev/null
+++ b/llvm/test/Transforms/GlobalMerge/private-global.ll
@@ -0,0 +1,36 @@
+; RUN: opt -global-merge -global-merge-max-offset=100 -S -o - %s | FileCheck %s
+; RUN: opt -passes='global-merge<max-offset=100>' -S -o - %s | FileCheck %s
+
+; NOTE: This is a copy of the llvm/test/Transforms/GlobalMerge/basic.ll test,
+; using `private` global variables instead of `internal`. This is to show that
+; private globals can be merged in the GlobalMerge pass.
+
+target datalayout = "e-p:64:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK: @_MergedGlobals = private global <{ i32, i32 }> <{ i32 1, i32 2 }>, align 4
+; CHECK: @_MergedGlobals.1 = private global <{ i32, i32 }> <{ i32 3, i32 4 }>, section "foo", align 4
+
+; CHECK-DAG: @a = private alias i32, ptr @_MergedGlobals{{$}}
+@a = private global i32 1
+
+; CHECK-DAG: @b = private alias i32, getelementptr inbounds (<{ i32, i32 }>, ptr @_MergedGlobals, i32 0, i32 1)
+@b = private global i32 2
+
+; CHECK-DAG: @c = private alias i32, ptr @_MergedGlobals.1{{$}}
+@c = private global i32 3, section "foo"
+
+; CHECK-DAG: @d = private alias i32, getelementptr inbounds (<{ i32, i32 }>, ptr @_MergedGlobals.1, i32 0, i32 1)
+@d = private global i32 4, section "foo"
+
+define void @use_private() {
+  ; CHECK: load i32, ptr @_MergedGlobals,
+  %x = load i32, ptr @a
+  ; CHECK: load i32, ptr getelementptr inbounds (<{ i32, i32 }>, ptr @_MergedGlobals, i32 0, i32 1)
+  %y = load i32, ptr @b
+  ; CHECK: load i32, ptr @_MergedGlobals.1
+  %z1 = load i32, ptr @c
+  ; CHECK: load i32, ptr getelementptr inbounds (<{ i32, i32 }>, ptr @_MergedGlobals.1, i32 0, i32 1)
+  %z2 = load i32, ptr @d
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll b/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll
new file mode 100644
index 00000000000000..637b985b4562ed
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/gep-use-outside-loop.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-vectorize -mtriple=x86_64-apple-macosx -mcpu=skylake-avx512 -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
+
+; Test case for https://github.com/llvm/llvm-project/issues/102934.
+define void @gep_use_in_dead_block(ptr noalias %dst, ptr %src) {
+; CHECK-LABEL: define void @gep_use_in_dead_block(
+; CHECK-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i16, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP5]], align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq <4 x i16> [[WIDE_LOAD]], <i16 10, i16 10, i16 10, i16 10>
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i1> [[TMP6]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i16, ptr [[TMP8]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.v4i16.p0(<4 x i16> zeroinitializer, ptr [[TMP12]], i32 2, <4 x i1> [[TMP7]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
+; CHECK-NEXT:    br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load i16, ptr [[GEP_SRC]], align 2
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i16 [[L]], 10
+; CHECK-NEXT:    br i1 [[C]], label %[[LOOP_LATCH]], label %[[THEN:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[DEAD:.*]]:
+; CHECK-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    br label %[[DEAD]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 99
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %gep.src = getelementptr i16, ptr %src, i64 %iv
+  %l = load i16, ptr %gep.src
+  %c = icmp eq i16 %l, 10
+  br i1 %c, label %loop.latch, label %then
+
+then:
+  %gep.dst = getelementptr i16, ptr %dst, i64 %iv
+  store i16 0, ptr %gep.dst, align 2
+  br label %loop.latch
+
+dead:
+  store i16 0, ptr %gep.dst, align 2
+  br label %dead
+
+loop.latch:
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 99
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+define void @gep_use_outside_loop(ptr noalias %dst, ptr %src) {
+; CHECK-LABEL: define void @gep_use_outside_loop(
+; CHECK-SAME: ptr noalias [[DST:%.*]], ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i16, ptr [[DST]], <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i16, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP3]], align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq <4 x i16> [[WIDE_LOAD]], <i16 10, i16 10, i16 10, i16 10>
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i16, ptr [[TMP6]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.v4i16.p0(<4 x i16> zeroinitializer, ptr [[TMP7]], i32 2, <4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 96
+; CHECK-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x ptr> [[TMP1]], i32 3
+; CHECK-NEXT:    br i1 false, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP_HEADER:.*]]
+; CHECK:       [[LOOP_HEADER]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L:%.*]] = load i16, ptr [[GEP_SRC]], align 2
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i16 [[L]], 10
+; CHECK-NEXT:    br i1 [[C]], label %[[LOOP_LATCH]], label %[[THEN:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    br label %[[LOOP_LATCH]]
+; CHECK:       [[LOOP_LATCH]]:
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 99
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[GEP_DST_LCSSA:%.*]] = phi ptr [ [[GEP_DST]], %[[LOOP_LATCH]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    store i16 0, ptr [[GEP_DST_LCSSA]], align 2
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %gep.dst = getelementptr i16, ptr %dst, i64 %iv
+  %gep.src = getelementptr i16, ptr %src, i64 %iv
+  %l = load i16, ptr %gep.src
+  %c = icmp eq i16 %l, 10
+  br i1 %c, label %loop.latch, label %then
+
+then:
+  store i16 0, ptr %gep.dst, align 2
+  br label %loop.latch
+
+loop.latch:
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 99
+  br i1 %ec, label %exit, label %loop.header
+
+exit:
+  store i16 0, ptr %gep.dst, align 2
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll
index db5a7105fd8c4d..f55e37c7772609 100644
--- a/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/skeleton-lcssa-crash.ll
@@ -29,13 +29,13 @@ define i16 @test(ptr %arg, i64 %N) {
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
-; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[L_2_LCSSA]], i64 2
-; CHECK-NEXT:    [[UGLYGEP5:%.*]] = getelementptr i8, ptr [[L_1_LCSSA]], i64 2
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[L_2_LCSSA]], i64 2
+; CHECK-NEXT:    [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[L_1_LCSSA]], i64 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[N]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 4
-; CHECK-NEXT:    [[UGLYGEP6:%.*]] = getelementptr i8, ptr [[L_1_LCSSA]], i64 [[TMP2]]
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[L_2_LCSSA]], [[UGLYGEP6]]
-; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[UGLYGEP5]], [[UGLYGEP]]
+; CHECK-NEXT:    [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[L_1_LCSSA]], i64 [[TMP2]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[L_2_LCSSA]], [[SCEVGEP6]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP5]], [[SCEVGEP]]
 ; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
 ; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
@@ -48,10 +48,10 @@ define i16 @test(ptr %arg, i64 %N) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[L_1]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP6]], align 2, !alias.scope !0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP6]], align 2, !alias.scope [[META0:![0-9]+]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[L_2]], i64 0
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i16> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT:    store i16 [[TMP8]], ptr [[TMP7]], align 2, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    store i16 [[TMP8]], ptr [[TMP7]], align 2, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
@@ -74,7 +74,7 @@ define i16 @test(ptr %arg, i64 %N) {
 ; CHECK-NEXT:    [[LOOP_L_1:%.*]] = load i16, ptr [[GEP_1]], align 2
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i16, ptr [[L_2_LCSSA]], i64 0
 ; CHECK-NEXT:    store i16 [[LOOP_L_1]], ptr [[GEP_2]], align 2
-; CHECK-NEXT:    br i1 [[C_5]], label [[LOOP_3]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[C_5]], label [[LOOP_3]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       exit.loopexit:
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
 ; CHECK:       exit.loopexit1:
@@ -138,31 +138,17 @@ define void @test2(ptr %dst) {
 ; CHECK-NEXT:    [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1
 ; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_2]], label [[LOOP_3_PH:%.*]]
 ; CHECK:       loop.3.ph:
-; CHECK-NEXT:    [[INDVAR_LCSSA1:%.*]] = phi i32 [ [[INDVAR]], [[LOOP_2]] ]
 ; CHECK-NEXT:    [[INDVAR_LCSSA:%.*]] = phi i32 [ [[INDVAR]], [[LOOP_2]] ]
 ; CHECK-NEXT:    [[IV_1_LCSSA:%.*]] = phi i64 [ [[IV_1]], [[LOOP_2]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = and i64 [[IV_1_LCSSA]], 4294967295
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[INDVAR_LCSSA1]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[INDVAR_LCSSA]], -1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[TMP1]], 1000
-; CHECK-NEXT:    [[SMIN2:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP2]], i32 1)
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i32 [[TMP2]], [[SMIN2]]
+; CHECK-NEXT:    [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP2]], i32 1)
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i32 [[TMP2]], [[SMIN]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
 ; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], 1
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP5]], 2
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
-; CHECK:       vector.scevcheck:
-; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[INDVAR_LCSSA]], -1
-; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], 1000
-; CHECK-NEXT:    [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP7]], i32 1)
-; CHECK-NEXT:    [[TMP8:%.*]] = sub i32 [[TMP7]], [[SMIN]]
-; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP6]], 999
-; CHECK-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP8]])
-; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
-; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; CHECK-NEXT:    [[TMP10:%.*]] = sub i32 [[TMP9]], [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp ugt i32 [[TMP10]], [[TMP9]]
-; CHECK-NEXT:    [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW]]
-; CHECK-NEXT:    br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP5]], 2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP5]], [[N_MOD_VF]]
@@ -171,21 +157,21 @@ define void @test2(ptr %dst) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[TMP0]], [[INDEX]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP14:%.*]] = add nsw i64 [[TMP13]], -1
-; CHECK-NEXT:    [[TMP15:%.*]] = and i64 [[TMP14]], 4294967295
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 -1
-; CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr [[TMP18]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw i64 [[TMP6]], -1
+; CHECK-NEXT:    [[TMP8:%.*]] = and i64 [[TMP7]], 4294967295
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 -1
+; CHECK-NEXT:    store <2 x i32> zeroinitializer, ptr [[TMP11]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP5]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOP_1_LATCH:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[LOOP_3_PH]] ], [ [[TMP0]], [[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[LOOP_3_PH]] ]
 ; CHECK-NEXT:    br label [[LOOP_3:%.*]]
 ; CHECK:       loop.3:
 ; CHECK-NEXT:    [[IV_2:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP_3]] ]
@@ -195,7 +181,7 @@ define void @test2(ptr %dst) {
 ; CHECK-NEXT:    store i32 0, ptr [[GEP_DST]], align 4
 ; CHECK-NEXT:    [[IV_2_TRUNC:%.*]] = trunc i64 [[IV_2]] to i32
 ; CHECK-NEXT:    [[EC:%.*]] = icmp sgt i32 [[IV_2_TRUNC]], 1
-; CHECK-NEXT:    br i1 [[EC]], label [[LOOP_3]], label [[LOOP_1_LATCH]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label [[LOOP_3]], label [[LOOP_1_LATCH]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       loop.1.latch:
 ; CHECK-NEXT:    [[C_2:%.*]] = call i1 @cond()
 ; CHECK-NEXT:    br i1 [[C_2]], label [[EXIT:%.*]], label [[LOOP_1_HEADER]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/const-in-different-functions.ll b/llvm/test/Transforms/SLPVectorizer/X86/const-in-different-functions.ll
new file mode 100644
index 00000000000000..2e473f4f2c213c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/const-in-different-functions.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=x86_64 -passes=slp-vectorizer < %s | FileCheck %s
+
+; Test that SLP vectorize doesn't crash if a stored constant is used in multiple
+; functions.
+
+@p = external global [64 x float]
+
+define void @_Z1hPfl() {
+; CHECK-LABEL: define void @_Z1hPfl() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr @p, i64 28
+; CHECK-NEXT:    store <2 x float> <float 0.000000e+00, float 1.000000e+00>, ptr [[TMP0]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = getelementptr i8, ptr @p, i64 28
+  store float 0.000000e+00, ptr %0, align 4
+  %1 = getelementptr i8, ptr @p, i64 32
+  store float 1.000000e+00, ptr %1, align 16
+  ret void
+}
+
+define void @_Z1mv(i64 %arrayidx4.i.2.idx) {
+; CHECK-LABEL: define void @_Z1mv(
+; CHECK-SAME: i64 [[ARRAYIDX4_I_2_IDX:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_COND1_PREHEADER_LR_PH_I:.*:]]
+; CHECK-NEXT:    br label %[[FOR_COND1_PREHEADER_I:.*]]
+; CHECK:       [[FOR_COND1_PREHEADER_I]]:
+; CHECK-NEXT:    store float 1.000000e+00, ptr @p, align 4
+; CHECK-NEXT:    [[ARRAYIDX4_I_2:%.*]] = getelementptr i8, ptr @p, i64 [[ARRAYIDX4_I_2_IDX]]
+; CHECK-NEXT:    store float 0.000000e+00, ptr [[ARRAYIDX4_I_2]], align 4
+; CHECK-NEXT:    br label %[[FOR_COND1_PREHEADER_I]]
+;
+entry:
+  ret void
+
+for.cond1.preheader.lr.ph.i:                      ; No predecessors!
+  br label %for.cond1.preheader.i
+
+for.cond1.preheader.i:                            ; preds = %for.cond1.preheader.i, %for.cond1.preheader.lr.ph.i
+  store float 1.000000e+00, ptr @p, align 4
+  %arrayidx4.i.2 = getelementptr i8, ptr @p, i64 %arrayidx4.i.2.idx
+  store float 0.000000e+00, ptr %arrayidx4.i.2, align 4
+  br label %for.cond1.preheader.i
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
new file mode 100644
index 00000000000000..6028a8b918941c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=slp-vectorizer,instcombine -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 %s | FileCheck %s
+
+define void @test1(ptr %in, ptr %out) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <8 x i32> [[TMP0]] to <8 x i64>
+; CHECK-NEXT:    store <8 x i64> [[TMP1]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <8 x i32>, ptr %in, align 1
+  %1 = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %3 = zext <4 x i32> %1 to <4 x i64>
+  %4 = zext <4 x i32> %2 to <4 x i64>
+  %5 = shufflevector <4 x i64> %3, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+  %6 = shufflevector <4 x i64> %3, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+  %7 = shufflevector <4 x i64> %4, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+  %8 = shufflevector <4 x i64> %4, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+  %9 = getelementptr inbounds i64, ptr %out, i64 0
+  %10 = getelementptr inbounds i64, ptr %out, i64 2
+  %11 = getelementptr inbounds i64, ptr %out, i64 4
+  %12 = getelementptr inbounds i64, ptr %out, i64 6
+  store <2 x i64> %5, ptr %9, align 8
+  store <2 x i64> %6, ptr %10, align 8
+  store <2 x i64> %7, ptr %11, align 8
+  store <2 x i64> %8, ptr %12, align 8
+  ret void
+}
+
+define void @test2(ptr %in, ptr %out) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP1]] to <4 x i64>
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[OUT:%.*]], i64 16
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[OUT]], i64 32
+; CHECK-NEXT:    store <2 x i64> [[TMP5]], ptr [[OUT]], align 8
+; CHECK-NEXT:    store <2 x i64> [[TMP6]], ptr [[TMP7]], align 8
+; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr [[TMP8]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <8 x i32>, ptr %in, align 1
+  %1 = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = shufflevector <8 x i32> %0, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %3 = zext <4 x i32> %1 to <4 x i64>
+  %4 = zext <4 x i32> %2 to <4 x i64>
+  %5 = shufflevector <4 x i64> %3, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+  %6 = shufflevector <4 x i64> %3, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+  %7 = shufflevector <4 x i64> %4, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
+  %8 = shufflevector <4 x i64> %4, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
+  %9 = getelementptr inbounds i64, ptr %out, i64 0
+  %10 = getelementptr inbounds i64, ptr %out, i64 2
+  %11 = getelementptr inbounds i64, ptr %out, i64 4
+  %12 = getelementptr inbounds i64, ptr %out, i64 6
+  store <2 x i64> %5, ptr %9, align 8
+  store <2 x i64> %6, ptr %10, align 8
+  store <2 x i64> %7, ptr %11, align 8
+  store <2 x i64> %8, ptr %12, align 8
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/revec.ll b/llvm/test/Transforms/SLPVectorizer/revec.ll
index d7c3ccd8c9ce8a..59201da1d9ac1a 100644
--- a/llvm/test/Transforms/SLPVectorizer/revec.ll
+++ b/llvm/test/Transforms/SLPVectorizer/revec.ll
@@ -147,3 +147,192 @@ entry:
   %5 = icmp ult <4 x ptr> %3, %4
   ret void
 }
+
+define <4 x i1> @test6(ptr %in1, ptr %in2) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[IN1:%.*]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[IN2:%.*]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v4i32(<16 x i32> poison, <4 x i32> poison, i64 4)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v4i32(<16 x i32> [[TMP2]], <4 x i32> poison, i64 8)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v4i32(<16 x i32> [[TMP3]], <4 x i32> poison, i64 12)
+; CHECK-NEXT:    [[TMP5:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v4i32(<16 x i32> [[TMP4]], <4 x i32> [[TMP0]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v4i32(<16 x i32> poison, <4 x i32> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v4i32(<16 x i32> [[TMP7]], <4 x i32> zeroinitializer, i64 4)
+; CHECK-NEXT:    [[TMP9:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v4i32(<16 x i32> [[TMP8]], <4 x i32> zeroinitializer, i64 8)
+; CHECK-NEXT:    [[TMP10:%.*]] = call <16 x i32> @llvm.vector.insert.v16i32.v4i32(<16 x i32> [[TMP9]], <4 x i32> zeroinitializer, i64 12)
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ugt <16 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> poison, <4 x i16> poison, i64 4)
+; CHECK-NEXT:    [[TMP13:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP12]], <4 x i16> poison, i64 8)
+; CHECK-NEXT:    [[TMP14:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP13]], <4 x i16> poison, i64 12)
+; CHECK-NEXT:    [[TMP15:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP14]], <4 x i16> [[TMP1]], i64 0)
+; CHECK-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i16> [[TMP15]], <16 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP17:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> poison, <4 x i16> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP18:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP17]], <4 x i16> zeroinitializer, i64 4)
+; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP18]], <4 x i16> zeroinitializer, i64 8)
+; CHECK-NEXT:    [[TMP20:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP19]], <4 x i16> zeroinitializer, i64 12)
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq <16 x i16> [[TMP16]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = and <16 x i1> [[TMP11]], [[TMP21]]
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp ugt <16 x i32> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[TMP24:%.*]] = and <16 x i1> [[TMP22]], [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = shufflevector <16 x i1> [[TMP24]], <16 x i1> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[TMP26:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP25]])
+; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <4 x i1> poison, i1 [[TMP26]], i64 0
+; CHECK-NEXT:    [[TMP28:%.*]] = shufflevector <16 x i1> [[TMP24]], <16 x i1> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[TMP29:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP28]])
+; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x i1> [[TMP27]], i1 [[TMP29]], i64 1
+; CHECK-NEXT:    [[TMP31:%.*]] = shufflevector <16 x i1> [[TMP24]], <16 x i1> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[TMP32:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP31]])
+; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <4 x i1> [[TMP30]], i1 [[TMP32]], i64 2
+; CHECK-NEXT:    [[TMP34:%.*]] = shufflevector <16 x i1> [[TMP24]], <16 x i1> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP35:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP34]])
+; CHECK-NEXT:    [[TMP36:%.*]] = insertelement <4 x i1> [[TMP33]], i1 [[TMP35]], i64 3
+; CHECK-NEXT:    [[VBSL:%.*]] = select <4 x i1> [[TMP36]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> <i32 5, i32 6, i32 7, i32 8>
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt <4 x i32> [[VBSL]], <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    ret <4 x i1> [[CMP]]
+;
+entry:
+  %0 = load <4 x i32>, ptr %in1, align 4
+  %1 = load <4 x i16>, ptr %in2, align 2
+  %cmp000 = icmp ugt <4 x i32> %0, zeroinitializer
+  %cmp001 = icmp ugt <4 x i32> %0, zeroinitializer
+  %cmp002 = icmp ugt <4 x i32> %0, zeroinitializer
+  %cmp003 = icmp ugt <4 x i32> %0, zeroinitializer
+  %cmp100 = icmp eq <4 x i16> %1, zeroinitializer
+  %cmp101 = icmp eq <4 x i16> %1, zeroinitializer
+  %cmp102 = icmp eq <4 x i16> %1, zeroinitializer
+  %cmp103 = icmp eq <4 x i16> %1, zeroinitializer
+  %and.cmp0 = and <4 x i1> %cmp000, %cmp100
+  %and.cmp1 = and <4 x i1> %cmp001, %cmp101
+  %and.cmp2 = and <4 x i1> %cmp002, %cmp102
+  %and.cmp3 = and <4 x i1> %cmp003, %cmp103
+  %cmp004 = icmp ugt <4 x i32> %0, zeroinitializer
+  %cmp005 = icmp ugt <4 x i32> %0, zeroinitializer
+  %cmp006 = icmp ugt <4 x i32> %0, zeroinitializer
+  %cmp007 = icmp ugt <4 x i32> %0, zeroinitializer
+  %and.cmp4 = and <4 x i1> %and.cmp0, %cmp004
+  %and.cmp5 = and <4 x i1> %and.cmp1, %cmp005
+  %and.cmp6 = and <4 x i1> %and.cmp2, %cmp006
+  %and.cmp7 = and <4 x i1> %and.cmp3, %cmp007
+  %or0 = or <4 x i1> %and.cmp5, %and.cmp4
+  %or1 = or <4 x i1> %or0, %and.cmp6
+  %or2 = or <4 x i1> %or1, %and.cmp7
+  %vbsl = select <4 x i1> %or2, <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> <i32 5, i32 6, i32 7, i32 8>
+  %cmp = icmp ugt <4 x i32> %vbsl, <i32 2, i32 3, i32 4, i32 5>
+  ret <4 x i1> %cmp
+}
+
+define void @test7() {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v8i64(<16 x i64> poison, <8 x i64> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v8i64(<16 x i64> [[TMP1]], <8 x i64> zeroinitializer, i64 8)
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc <16 x i64> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    store <16 x i16> [[TMP3]], ptr null, align 2
+; CHECK-NEXT:    ret void
+;
+  %1 = getelementptr i8, ptr null, i64 16
+  %2 = trunc <8 x i64> zeroinitializer to <8 x i16>
+  store <8 x i16> %2, ptr %1, align 2
+  %3 = trunc <8 x i64> zeroinitializer to <8 x i16>
+  store <8 x i16> %3, ptr null, align 2
+  ret void
+}
+
+define void @test8() {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> poison, <2 x float> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> [[TMP0]], <2 x float> zeroinitializer, i64 2)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> [[TMP1]], <2 x float> zeroinitializer, i64 4)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v2f32(<8 x float> [[TMP2]], <2 x float> zeroinitializer, i64 6)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP4]], <2 x float> zeroinitializer, i64 2)
+; CHECK-NEXT:    br i1 false, label [[FOR0:%.*]], label [[FOR_BODY:%.*]]
+; CHECK:       for0:
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <8 x float> [ [[TMP3]], [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[TMP7:%.*]] = phi <4 x float> [ [[TMP7]], [[FOR_BODY]] ], [ [[TMP5]], [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP8]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    br i1 false, label [[FOR0]], label [[FOR_BODY]]
+;
+entry:
+  br i1 false, label %for0, label %for.body
+
+for0:
+  %0 = phi <2 x float> [ zeroinitializer, %entry ], [ %4, %for.body ]
+  %1 = phi <2 x float> [ zeroinitializer, %entry ], [ %5, %for.body ]
+  %2 = phi <2 x float> [ zeroinitializer, %entry ], [ %4, %for.body ]
+  %3 = phi <2 x float> [ zeroinitializer, %entry ], [ %5, %for.body ]
+  ret void
+
+for.body:
+  %4 = phi <2 x float> [ %4, %for.body ], [ zeroinitializer, %entry ]
+  %5 = phi <2 x float> [ %5, %for.body ], [ zeroinitializer, %entry ]
+  br i1 false, label %for0, label %for.body
+}
+
+define void @test9() {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> poison, <4 x i16> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> [[TMP0]], <4 x i16> zeroinitializer, i64 4)
+; CHECK-NEXT:    br label [[FOR_BODY13:%.*]]
+; CHECK:       for.body13:
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = zext <8 x i1> [[TMP2]] to <8 x i32>
+; CHECK-NEXT:    store <8 x i32> [[TMP3]], ptr null, align 4
+; CHECK-NEXT:    br label [[FOR_BODY13]]
+;
+entry:
+  br label %for.body13
+
+for.body13:                                       ; preds = %for.body13, %entry
+  %vmovl.i111 = sext <4 x i16> zeroinitializer to <4 x i32>
+  %vmovl.i110 = sext <4 x i16> zeroinitializer to <4 x i32>
+  store <4 x i32> %vmovl.i111, ptr null, align 4
+  %add.ptr29 = getelementptr i8, ptr null, i64 16
+  store <4 x i32> %vmovl.i110, ptr %add.ptr29, align 4
+  br label %for.body13
+}
+
+define void @test10() {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr null, align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.vector.insert.v32i8.v16i8(<32 x i8> poison, <16 x i8> poison, i64 16)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <32 x i8> @llvm.vector.insert.v32i8.v16i8(<32 x i8> [[TMP1]], <16 x i8> [[TMP0]], i64 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i8> [[TMP2]], <32 x i8> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i8> [[TMP3]], <32 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP5:%.*]] = sext <16 x i8> [[TMP4]] to <16 x i16>
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <16 x i16> [[TMP5]], <16 x i16> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <32 x i16> [[TMP6]], <32 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc <16 x i16> [[TMP7]] to <16 x i8>
+; CHECK-NEXT:    [[TMP9:%.*]] = sext <16 x i8> [[TMP8]] to <16 x i32>
+; CHECK-NEXT:    store <16 x i32> [[TMP9]], ptr null, align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <16 x i8>, ptr null, align 1
+  %shuffle.i = shufflevector <16 x i8> %0, <16 x i8> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %shuffle.i107 = shufflevector <16 x i8> %0, <16 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %vmovl.i106 = sext <8 x i8> %shuffle.i to <8 x i16>
+  %vmovl.i = sext <8 x i8> %shuffle.i107 to <8 x i16>
+  %shuffle.i113 = shufflevector <8 x i16> %vmovl.i106, <8 x i16> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %shuffle.i115 = shufflevector <8 x i16> %vmovl.i106, <8 x i16> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %shuffle.i112 = shufflevector <8 x i16> %vmovl.i, <8 x i16> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %shuffle.i114 = shufflevector <8 x i16> %vmovl.i, <8 x i16> zeroinitializer, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vmovl.i111 = sext <4 x i16> %shuffle.i113 to <4 x i32>
+  %vmovl.i110 = sext <4 x i16> %shuffle.i115 to <4 x i32>
+  %vmovl.i109 = sext <4 x i16> %shuffle.i112 to <4 x i32>
+  %vmovl.i108 = sext <4 x i16> %shuffle.i114 to <4 x i32>
+  %add.ptr29 = getelementptr i8, ptr null, i64 16
+  %add.ptr32 = getelementptr i8, ptr null, i64 32
+  %add.ptr35 = getelementptr i8, ptr null, i64 48
+  store <4 x i32> %vmovl.i111, ptr null, align 4
+  store <4 x i32> %vmovl.i110, ptr %add.ptr29, align 4
+  store <4 x i32> %vmovl.i109, ptr %add.ptr32, align 4
+  store <4 x i32> %vmovl.i108, ptr %add.ptr35, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-missing-probe.prof b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-missing-probe.prof
new file mode 100644
index 00000000000000..cc50d6a2fad2b9
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/pseudo-probe-missing-probe.prof
@@ -0,0 +1,13 @@
+main:89650:0
+ 1: 0
+ 2: 16724
+ 3: 16724
+ 4: 14342
+ 5: 15026 bar:15026
+ 6: 1882
+ 8: 16724
+ 9: 0
+ !CFGChecksum: 563091374530180
+bar:15026:15026
+ 1: 15026
+ !CFGChecksum: 4294967295
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-missing-probe.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-missing-probe.ll
new file mode 100644
index 00000000000000..3d559f2fb0159a
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-missing-probe.ll
@@ -0,0 +1,243 @@
+; RUN: opt < %s -passes=sample-profile  -sample-profile-file=%S/Inputs/pseudo-probe-missing-probe.prof -S | FileCheck %s
+
+; CHECK:  br i1 %tobool.not.i, label %if.end.i, label %if.then.i, !dbg ![[#]], !prof ![[#PROF:]]
+
+; CHECK:  [[#PROF]] = !{!"branch_weights", i32 918, i32 918}
+; Verify the else branch is not set to a zero count
+; CHECK-NOT:  [[#PROF]] = !{!"branch_weights", i32 1698, i32 0}
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@x = dso_local global i32 0, align 4, !dbg !0
+
+; Function Attrs: nofree noinline norecurse nounwind memory(readwrite, argmem: none) uwtable
+define dso_local void @bar(i32 %i) local_unnamed_addr #0 !dbg !18 {
+entry:
+    #dbg_value(i32 poison, !22, !DIExpression(), !23)
+  call void @llvm.pseudoprobe(i64 -2012135647395072713, i64 1, i32 0, i64 -1), !dbg !24
+  %0 = load volatile i32, ptr @x, align 4, !dbg !24, !tbaa !25
+  %add = add nsw i32 %0, 5, !dbg !24
+  store volatile i32 %add, ptr @x, align 4, !dbg !24, !tbaa !25
+  ret void, !dbg !29
+}
+
+; Function Attrs: nofree norecurse nounwind memory(readwrite, argmem: none) uwtable
+define dso_local void @baz(i32 noundef %i) local_unnamed_addr #1 !dbg !30 {
+entry:
+    #dbg_value(i32 %i, !32, !DIExpression(), !33)
+  call void @llvm.pseudoprobe(i64 7546896869197086323, i64 1, i32 0, i64 -1), !dbg !34
+  %rem = srem i32 %i, 100, !dbg !36
+  %tobool.not = icmp eq i32 %rem, 0, !dbg !36
+  br i1 %tobool.not, label %if.end, label %if.then, !dbg !37
+
+if.then:                                          ; preds = %entry
+  call void @llvm.pseudoprobe(i64 7546896869197086323, i64 2, i32 0, i64 -1), !dbg !38
+  %0 = load volatile i32, ptr @x, align 4, !dbg !38, !tbaa !25
+  %inc = add nsw i32 %0, 1, !dbg !38
+  store volatile i32 %inc, ptr @x, align 4, !dbg !38, !tbaa !25
+  br label %if.end, !dbg !39
+
+if.end:                                           ; preds = %if.then, %entry
+  call void @llvm.pseudoprobe(i64 7546896869197086323, i64 3, i32 0, i64 -1), !dbg !40
+  %1 = load volatile i32, ptr @x, align 4, !dbg !40, !tbaa !25
+  %add = add nsw i32 %1, 2, !dbg !40
+  store volatile i32 %add, ptr @x, align 4, !dbg !40, !tbaa !25
+  %2 = and i32 %i, 1, !dbg !41
+  %tobool2.not = icmp eq i32 %2, 0, !dbg !41
+  br i1 %tobool2.not, label %if.else, label %if.end11, !dbg !43
+
+if.else:                                          ; preds = %if.end
+  call void @llvm.pseudoprobe(i64 7546896869197086323, i64 5, i32 0, i64 -1), !dbg !44
+  %rem5 = srem i32 %i, 3, !dbg !46
+  %tobool6.not = icmp eq i32 %rem5, 0, !dbg !46
+  %spec.select = select i1 %tobool6.not, i32 -1, i32 2, !dbg !47
+  br label %if.end11, !dbg !47
+
+if.end11:                                         ; preds = %if.else, %if.end
+  %.sink14 = phi i32 [ 1, %if.end ], [ %spec.select, %if.else ]
+  %3 = load volatile i32, ptr @x, align 4, !dbg !48, !tbaa !25
+  %add8 = add nsw i32 %3, %.sink14, !dbg !48
+  store volatile i32 %add8, ptr @x, align 4, !dbg !48, !tbaa !25
+  call void @llvm.pseudoprobe(i64 7546896869197086323, i64 9, i32 0, i64 -1), !dbg !49
+  ret void, !dbg !49
+}
+
+; Function Attrs: nofree norecurse nounwind uwtable
+define dso_local noundef i32 @main() local_unnamed_addr #2 !dbg !50 {
+entry:
+  call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 1, i32 0, i64 -1), !dbg !55
+    #dbg_value(i32 0, !54, !DIExpression(), !56)
+  call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 2, i32 0, i64 -1), !dbg !57
+  br label %while.body, !dbg !58
+
+while.body:                                       ; preds = %entry, %if.end
+  %inc7 = phi i32 [ 1, %entry ], [ %inc, %if.end ]
+  %i.06 = phi i32 [ 0, %entry ], [ %inc7, %if.end ]
+    #dbg_value(i32 %i.06, !54, !DIExpression(), !56)
+  call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 3, i32 0, i64 -1), !dbg !59
+  %rem = urem i32 %inc7, 10, !dbg !62
+  %tobool.not = icmp eq i32 %rem, 0, !dbg !62
+  br i1 %tobool.not, label %if.else, label %if.then, !dbg !63
+
+if.then:                                          ; preds = %while.body
+  call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 4, i32 0, i64 -1), !dbg !64
+  tail call void @bar(i32 poison), !dbg !65
+  br label %if.end, !dbg !67
+
+if.else:                                          ; preds = %while.body
+  call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 6, i32 0, i64 -1), !dbg !68
+    #dbg_value(i32 %inc7, !32, !DIExpression(), !69)
+  call void @llvm.pseudoprobe(i64 7546896869197086323, i64 1, i32 0, i64 -1), !dbg !72
+  %rem.i4 = urem i32 %inc7, 100, !dbg !73
+  %tobool.not.i = icmp eq i32 %rem.i4, 0, !dbg !73
+  br i1 %tobool.not.i, label %if.end.i, label %if.then.i, !dbg !74
+
+if.then.i:                                        ; preds = %if.else
+  call void @llvm.pseudoprobe(i64 7546896869197086323, i64 2, i32 0, i64 -1), !dbg !75
+  %0 = load volatile i32, ptr @x, align 4, !dbg !75, !tbaa !25
+  %inc.i = add nsw i32 %0, 1, !dbg !75
+  store volatile i32 %inc.i, ptr @x, align 4, !dbg !75, !tbaa !25
+  br label %if.end.i, !dbg !76
+
+if.end.i:                                         ; preds = %if.then.i, %if.else
+  call void @llvm.pseudoprobe(i64 7546896869197086323, i64 3, i32 0, i64 -1), !dbg !77
+  %1 = load volatile i32, ptr @x, align 4, !dbg !77, !tbaa !25
+  %add.i = add nsw i32 %1, 2, !dbg !77
+  store volatile i32 %add.i, ptr @x, align 4, !dbg !77, !tbaa !25
+  %2 = and i32 %i.06, 1, !dbg !78
+  %tobool2.not.i.not = icmp eq i32 %2, 0, !dbg !78
+  br i1 %tobool2.not.i.not, label %baz.exit, label %if.else.i, !dbg !79
+
+if.else.i:                                        ; preds = %if.end.i
+  call void @llvm.pseudoprobe(i64 7546896869197086323, i64 5, i32 0, i64 -1), !dbg !80
+  %rem5.i5 = urem i32 %inc7, 3, !dbg !81
+  %tobool6.not.i = icmp eq i32 %rem5.i5, 0, !dbg !81
+  %spec.select.i = select i1 %tobool6.not.i, i32 -1, i32 2, !dbg !82
+  br label %baz.exit, !dbg !82
+
+baz.exit:                                         ; preds = %if.end.i, %if.else.i
+  %.sink14.i = phi i32 [ 1, %if.end.i ], [ %spec.select.i, %if.else.i ]
+  %3 = load volatile i32, ptr @x, align 4, !dbg !83, !tbaa !25
+  %add8.i = add nsw i32 %3, %.sink14.i, !dbg !83
+  store volatile i32 %add8.i, ptr @x, align 4, !dbg !83, !tbaa !25
+  call void @llvm.pseudoprobe(i64 7546896869197086323, i64 9, i32 0, i64 -1), !dbg !84
+  br label %if.end
+
+if.end:                                           ; preds = %baz.exit, %if.then
+  call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 8, i32 0, i64 -1), !dbg !58
+    #dbg_value(i32 %inc7, !54, !DIExpression(), !56)
+  call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 2, i32 0, i64 -1), !dbg !57
+  %inc = add nuw nsw i32 %inc7, 1, !dbg !57
+    #dbg_value(i32 %inc, !54, !DIExpression(), !56)
+  %exitcond.not = icmp eq i32 %inc, 160000001, !dbg !85
+  br i1 %exitcond.not, label %while.end, label %while.body, !dbg !58, !llvm.loop !86
+
+while.end:                                        ; preds = %if.end
+  call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 9, i32 0, i64 -1), !dbg !89
+  ret i32 0, !dbg !89
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite)
+declare void @llvm.pseudoprobe(i64, i64, i32, i64) #3
+
+attributes #0 = { nofree noinline norecurse nounwind memory(readwrite, argmem: none) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nofree norecurse nounwind memory(readwrite, argmem: none) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { nofree norecurse nounwind uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "use-sample-profile"}
+attributes #3 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!7, !8, !9, !10, !11, !12, !13}
+!llvm.ident = !{!14}
+!llvm.pseudo_probe_desc = !{!15, !16, !17}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "x", scope: !2, file: !3, line: 1, type: !5, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !3, producer: "clang version 20.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false, nameTableKind: None)
+!3 = !DIFile(filename: "test.c", directory: "/home", checksumkind: CSK_MD5, checksum: "b67c15e928f76c51702a59639dbebb4c")
+!4 = !{!0}
+!5 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !6)
+!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!7 = !{i32 7, !"Dwarf Version", i32 5}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{i32 1, !"wchar_size", i32 4}
+!10 = !{i32 8, !"PIC Level", i32 2}
+!11 = !{i32 7, !"PIE Level", i32 2}
+!12 = !{i32 7, !"uwtable", i32 2}
+!13 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
+!14 = !{!"clang version 20.0.0"}
+!15 = !{i64 -2012135647395072713, i64 4294967295, !"bar"}
+!16 = !{i64 7546896869197086323, i64 191430930410, !"baz"}
+!17 = !{i64 -2624081020897602054, i64 563091374530180, !"main"}
+!18 = distinct !DISubprogram(name: "bar", scope: !3, file: !3, line: 3, type: !19, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21)
+!19 = !DISubroutineType(types: !20)
+!20 = !{null, !6}
+!21 = !{!22}
+!22 = !DILocalVariable(name: "i", arg: 1, scope: !18, file: !3, line: 3, type: !6)
+!23 = !DILocation(line: 0, scope: !18)
+!24 = !DILocation(line: 4, column: 5, scope: !18)
+!25 = !{!26, !26, i64 0}
+!26 = !{!"int", !27, i64 0}
+!27 = !{!"omnipotent char", !28, i64 0}
+!28 = !{!"Simple C/C++ TBAA"}
+!29 = !DILocation(line: 8, column: 1, scope: !18)
+!30 = distinct !DISubprogram(name: "baz", scope: !3, file: !3, line: 10, type: !19, scopeLine: 10, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !31)
+!31 = !{!32}
+!32 = !DILocalVariable(name: "i", arg: 1, scope: !30, file: !3, line: 10, type: !6)
+!33 = !DILocation(line: 0, scope: !30)
+!34 = !DILocation(line: 11, column: 6, scope: !35)
+!35 = distinct !DILexicalBlock(scope: !30, file: !3, line: 11, column: 6)
+!36 = !DILocation(line: 11, column: 7, scope: !35)
+!37 = !DILocation(line: 11, column: 6, scope: !30)
+!38 = !DILocation(line: 12, column: 6, scope: !35)
+!39 = !DILocation(line: 12, column: 5, scope: !35)
+!40 = !DILocation(line: 14, column: 5, scope: !30)
+!41 = !DILocation(line: 15, column: 9, scope: !42)
+!42 = distinct !DILexicalBlock(scope: !30, file: !3, line: 15, column: 7)
+!43 = !DILocation(line: 15, column: 7, scope: !30)
+!44 = !DILocation(line: 17, column: 12, scope: !45)
+!45 = distinct !DILexicalBlock(scope: !42, file: !3, line: 17, column: 12)
+!46 = !DILocation(line: 17, column: 14, scope: !45)
+!47 = !DILocation(line: 17, column: 12, scope: !42)
+!48 = !DILocation(line: 0, scope: !42)
+!49 = !DILocation(line: 21, column: 1, scope: !30)
+!50 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 23, type: !51, scopeLine: 23, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !53)
+!51 = !DISubroutineType(types: !52)
+!52 = !{!6}
+!53 = !{!54}
+!54 = !DILocalVariable(name: "i", scope: !50, file: !3, line: 24, type: !6)
+!55 = !DILocation(line: 24, column: 7, scope: !50)
+!56 = !DILocation(line: 0, scope: !50)
+!57 = !DILocation(line: 25, column: 11, scope: !50)
+!58 = !DILocation(line: 25, column: 3, scope: !50)
+!59 = !DILocation(line: 26, column: 8, scope: !60)
+!60 = distinct !DILexicalBlock(scope: !61, file: !3, line: 26, column: 8)
+!61 = distinct !DILexicalBlock(scope: !50, file: !3, line: 25, column: 30)
+!62 = !DILocation(line: 26, column: 10, scope: !60)
+!63 = !DILocation(line: 26, column: 8, scope: !61)
+!64 = !DILocation(line: 27, column: 10, scope: !60)
+!65 = !DILocation(line: 27, column: 6, scope: !66)
+!66 = !DILexicalBlockFile(scope: !60, file: !3, discriminator: 455082031)
+!67 = !DILocation(line: 27, column: 6, scope: !60)
+!68 = !DILocation(line: 29, column: 10, scope: !60)
+!69 = !DILocation(line: 0, scope: !30, inlinedAt: !70)
+!70 = distinct !DILocation(line: 29, column: 6, scope: !71)
+!71 = !DILexicalBlockFile(scope: !60, file: !3, discriminator: 455082047)
+!72 = !DILocation(line: 11, column: 6, scope: !35, inlinedAt: !70)
+!73 = !DILocation(line: 11, column: 7, scope: !35, inlinedAt: !70)
+!74 = !DILocation(line: 11, column: 6, scope: !30, inlinedAt: !70)
+!75 = !DILocation(line: 12, column: 6, scope: !35, inlinedAt: !70)
+!76 = !DILocation(line: 12, column: 5, scope: !35, inlinedAt: !70)
+!77 = !DILocation(line: 14, column: 5, scope: !30, inlinedAt: !70)
+!78 = !DILocation(line: 15, column: 9, scope: !42, inlinedAt: !70)
+!79 = !DILocation(line: 15, column: 7, scope: !30, inlinedAt: !70)
+!80 = !DILocation(line: 17, column: 12, scope: !45, inlinedAt: !70)
+!81 = !DILocation(line: 17, column: 14, scope: !45, inlinedAt: !70)
+!82 = !DILocation(line: 17, column: 12, scope: !42, inlinedAt: !70)
+!83 = !DILocation(line: 0, scope: !42, inlinedAt: !70)
+!84 = !DILocation(line: 21, column: 1, scope: !30, inlinedAt: !70)
+!85 = !DILocation(line: 25, column: 14, scope: !50)
+!86 = distinct !{!86, !58, !87, !88}
+!87 = !DILocation(line: 30, column: 3, scope: !50)
+!88 = !{!"llvm.loop.mustprogress"}
+!89 = !DILocation(line: 31, column: 3, scope: !50)
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/DW_OP_implicit_pointer.yaml b/llvm/test/tools/llvm-dwarfdump/X86/DW_OP_implicit_pointer.yaml
new file mode 100644
index 00000000000000..b6b2d1d0e183e1
--- /dev/null
+++ b/llvm/test/tools/llvm-dwarfdump/X86/DW_OP_implicit_pointer.yaml
@@ -0,0 +1,87 @@
+# Test that we can decode `DW_OP_implicit_pointer` (0xa0)
+# RUN: yaml2obj %s | llvm-dwarfdump - | FileCheck %s
+
+# CHECK:      DW_TAG_variable
+# CHECK-NEXT:   DW_AT_location (DW_OP_implicit_pointer 0x2a +4)
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_DYN
+  Machine:         EM_X86_64
+DWARF:
+  debug_abbrev:
+    - Table:
+        - Code:            0x00000001
+          Tag:             DW_TAG_compile_unit
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_language
+              Form:            DW_FORM_data2
+            - Attribute:       DW_AT_low_pc
+              Form:            DW_FORM_addr
+            - Attribute:       DW_AT_high_pc
+              Form:            DW_FORM_data4
+        - Code:            0x00000002
+          Tag:             DW_TAG_subprogram
+          Children:        DW_CHILDREN_yes
+          Attributes:
+            - Attribute:       DW_AT_low_pc
+              Form:            DW_FORM_addr
+            - Attribute:       DW_AT_high_pc
+              Form:            DW_FORM_data4
+            - Attribute:       DW_AT_frame_base
+              Form:            DW_FORM_exprloc
+        - Code:            0x00000003
+          Tag:             DW_TAG_formal_parameter
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute:       DW_AT_location
+              Form:            DW_FORM_exprloc
+        - Code:            0x00000004
+          Tag:             DW_TAG_variable
+          Children:        DW_CHILDREN_no
+          Attributes:
+            - Attribute:       DW_AT_location
+              Form:            DW_FORM_exprloc
+  debug_info:
+    - Length:          52
+      Version:         5
+      UnitType:        DW_UT_compile
+      AbbrOffset:      0
+      AddrSize:        8
+      Entries:
+        - AbbrCode:        0x00000001
+          Values:
+            - Value:           0x000000000000000C
+            - Value:           0x0000000100000F50
+            - Value:           0x0000000000000034
+        - AbbrCode:        0x00000002
+          Values:
+            - Value:           0x0000000100000F50
+            - Value:           0x0000000000000034
+            - Value:           0x0000000000000001
+              BlockData:
+                - 0x56
+        - AbbrCode:        0x00000003
+          Values:
+            - Value:           0x0000000000000002
+              BlockData:
+                - 0x91
+                - 0x78
+        - AbbrCode:        0x00000004
+          Values:
+            - Value:           0x0000000000000006
+              BlockData:
+                - 0xa0 # DW_OP_implicit_pointer
+                - 0x2a # Section offset of parameter in the previous entry
+                - 0x00
+                - 0x00
+                - 0x00
+                - 0x04 # Pointer references location 4 bytes into value of previous entry
+        - AbbrCode:        0x00000000
+          Values:
+        - AbbrCode:        0x00000000
+          Values:
+...
diff --git a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp
index 6fc24f6796310d..42aad6ae507bf6 100644
--- a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp
+++ b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp
@@ -1202,8 +1202,8 @@ TEST_F(ScalarEvolutionsTest, SCEVComputeConstantDifference) {
     EXPECT_EQ(diff(ScevIV, ScevIVNext), -1);
     EXPECT_EQ(diff(ScevIVNext, ScevIV), 1);
     EXPECT_EQ(diff(ScevIVNext, ScevIVNext), 0);
-    EXPECT_EQ(diff(ScevIV2P3, ScevIV2), std::nullopt); // TODO
-    EXPECT_EQ(diff(ScevIV2PVar, ScevIV2PVarP3), std::nullopt); // TODO
+    EXPECT_EQ(diff(ScevIV2P3, ScevIV2), 3);
+    EXPECT_EQ(diff(ScevIV2PVar, ScevIV2PVarP3), -3);
     EXPECT_EQ(diff(ScevV0, ScevIV), std::nullopt);
     EXPECT_EQ(diff(ScevIVNext, ScevV3), std::nullopt);
     EXPECT_EQ(diff(ScevYY, ScevV3), std::nullopt);
diff --git a/llvm/unittests/IR/DataLayoutTest.cpp b/llvm/unittests/IR/DataLayoutTest.cpp
index 4b711e87e1a9d2..113bb578f6bc3b 100644
--- a/llvm/unittests/IR/DataLayoutTest.cpp
+++ b/llvm/unittests/IR/DataLayoutTest.cpp
@@ -19,6 +19,27 @@ using namespace llvm;
 
 namespace {
 
+TEST(DataLayoutTest, CopyAssignmentInvalidatesStructLayout) {
+  DataLayout DL1 = cantFail(DataLayout::parse("p:32:32"));
+  DataLayout DL2 = cantFail(DataLayout::parse("p:64:64"));
+
+  LLVMContext Ctx;
+  StructType *Ty = StructType::get(PointerType::getUnqual(Ctx));
+
+  // Initialize struct layout caches.
+  EXPECT_EQ(DL1.getStructLayout(Ty)->getSizeInBits(), 32U);
+  EXPECT_EQ(DL1.getStructLayout(Ty)->getAlignment(), Align(4));
+  EXPECT_EQ(DL2.getStructLayout(Ty)->getSizeInBits(), 64U);
+  EXPECT_EQ(DL2.getStructLayout(Ty)->getAlignment(), Align(8));
+
+  // The copy should invalidate DL1's cache.
+  DL1 = DL2;
+  EXPECT_EQ(DL1.getStructLayout(Ty)->getSizeInBits(), 64U);
+  EXPECT_EQ(DL1.getStructLayout(Ty)->getAlignment(), Align(8));
+  EXPECT_EQ(DL2.getStructLayout(Ty)->getSizeInBits(), 64U);
+  EXPECT_EQ(DL2.getStructLayout(Ty)->getAlignment(), Align(8));
+}
+
 TEST(DataLayoutTest, FunctionPtrAlign) {
   EXPECT_EQ(MaybeAlign(0), DataLayout("").getFunctionPtrAlign());
   EXPECT_EQ(MaybeAlign(1), DataLayout("Fi8").getFunctionPtrAlign());
diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp
index cac8acbe15a79d..953df224e84dcb 100644
--- a/llvm/unittests/IR/DebugInfoTest.cpp
+++ b/llvm/unittests/IR/DebugInfoTest.cpp
@@ -1244,4 +1244,37 @@ TEST(DIBuilder, HashingDISubprogram) {
   EXPECT_EQ(HashDefinition, HashDefinitionAfter);
 }
 
+TEST(DIBuilder, CompositeTypes) {
+  LLVMContext Ctx;
+  std::unique_ptr<Module> M = std::make_unique<Module>("MyModule", Ctx);
+  DIBuilder DIB(*M);
+
+  DIFile *F = DIB.createFile("main.c", "/");
+  DICompileUnit *CU =
+      DIB.createCompileUnit(dwarf::DW_LANG_C, F, "Test", false, "", 0);
+
+  DICompositeType *Class =
+      DIB.createClassType(CU, "MyClass", F, 0, 8, 8, 0, {}, nullptr, {}, 0,
+                          nullptr, nullptr, "ClassUniqueIdentifier");
+  EXPECT_EQ(Class->getTag(), dwarf::DW_TAG_class_type);
+
+  DICompositeType *Struct = DIB.createStructType(
+      CU, "MyStruct", F, 0, 8, 8, {}, {}, {}, 0, {}, "StructUniqueIdentifier");
+  EXPECT_EQ(Struct->getTag(), dwarf::DW_TAG_structure_type);
+
+  DICompositeType *Union = DIB.createUnionType(CU, "MyUnion", F, 0, 8, 8, {},
+                                               {}, 0, "UnionUniqueIdentifier");
+  EXPECT_EQ(Union->getTag(), dwarf::DW_TAG_union_type);
+
+  DICompositeType *Array = DIB.createArrayType(8, 8, nullptr, {});
+  EXPECT_EQ(Array->getTag(), dwarf::DW_TAG_array_type);
+
+  DICompositeType *Vector = DIB.createVectorType(8, 8, nullptr, {});
+  EXPECT_EQ(Vector->getTag(), dwarf::DW_TAG_array_type);
+
+  DICompositeType *Enum = DIB.createEnumerationType(
+      CU, "MyEnum", F, 0, 8, 8, {}, nullptr, 0, "EnumUniqueIdentifier");
+  EXPECT_EQ(Enum->getTag(), dwarf::DW_TAG_enumeration_type);
+}
+
 } // end namespace
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index ff96df85812002..434cca93ae720a 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -521,11 +521,10 @@ TEST_F(IRBuilderTest, GetIntTy) {
   IntegerType *Ty1 = Builder.getInt1Ty();
   EXPECT_EQ(Ty1, IntegerType::get(Ctx, 1));
 
-  DataLayout* DL = new DataLayout(M.get());
-  IntegerType *IntPtrTy = Builder.getIntPtrTy(*DL);
-  unsigned IntPtrBitSize =  DL->getPointerSizeInBits(0);
+  const DataLayout &DL = M->getDataLayout();
+  IntegerType *IntPtrTy = Builder.getIntPtrTy(DL);
+  unsigned IntPtrBitSize = DL.getPointerSizeInBits(0);
   EXPECT_EQ(IntPtrTy, IntegerType::get(Ctx, IntPtrBitSize));
-  delete DL;
 }
 
 TEST_F(IRBuilderTest, UnaryOperators) {
diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
index 1cd1ca6a418c61..6e1a8f691141fc 100644
--- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp
+++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
@@ -1497,7 +1497,7 @@ define void @foo(ptr %ptr, <2 x ptr> %ptrs) {
     // Check hasNoUnsignedWrap().
     EXPECT_EQ(GEP->hasNoUnsignedWrap(), LLVMGEP->hasNoUnsignedWrap());
     // Check accumulateConstantOffset().
-    DataLayout DL(M.get());
+    const DataLayout &DL = M->getDataLayout();
     APInt Offset1 =
         APInt::getZero(DL.getIndexSizeInBits(GEP->getPointerAddressSpace()));
     APInt Offset2 =
@@ -1577,7 +1577,7 @@ define void @foo() {
   ret void
 }
 )IR");
-  DataLayout DL(M.get());
+  const DataLayout &DL = M->getDataLayout();
   llvm::Function &LLVMF = *M->getFunction("foo");
   llvm::BasicBlock *LLVMBB = &*LLVMF.begin();
   auto LLVMIt = LLVMBB->begin();
diff --git a/llvm/unittests/Support/DynamicLibrary/CMakeLists.txt b/llvm/unittests/Support/DynamicLibrary/CMakeLists.txt
index d8dff1ef4a3f77..ff0b2f07ee1363 100644
--- a/llvm/unittests/Support/DynamicLibrary/CMakeLists.txt
+++ b/llvm/unittests/Support/DynamicLibrary/CMakeLists.txt
@@ -15,6 +15,8 @@ set_output_directory(DynamicLibraryLib
   LIBRARY_DIR ${LLVM_LIBRARY_OUTPUT_INTDIR}
   )
 
+set(LLVM_EXPORTED_SYMBOL_FILE ${CMAKE_CURRENT_SOURCE_DIR}/DynamicLibraryTests.exports)
+
 add_llvm_unittest(DynamicLibraryTests
   DynamicLibraryTest.cpp
 
@@ -22,6 +24,8 @@ add_llvm_unittest(DynamicLibraryTests
   )
 target_link_libraries(DynamicLibraryTests PRIVATE DynamicLibraryLib)
 
+unset(LLVM_EXPORTED_SYMBOL_FILE)
+
 function(dynlib_add_module NAME)
   add_library(${NAME} MODULE
     PipSqueak.cpp
diff --git a/llvm/unittests/Support/DynamicLibrary/DynamicLibraryTests.exports b/llvm/unittests/Support/DynamicLibrary/DynamicLibraryTests.exports
new file mode 100644
index 00000000000000..a9122211071ee6
--- /dev/null
+++ b/llvm/unittests/Support/DynamicLibrary/DynamicLibraryTests.exports
@@ -0,0 +1 @@
+TestA
diff --git a/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn
index 1708af8612bc28..b8e2cc744dc013 100644
--- a/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn
@@ -94,6 +94,7 @@ static_library("AST") {
     "InheritViz.cpp",
     "Interp/ByteCodeEmitter.cpp",
     "Interp/Compiler.cpp",
+    "Interp/CompilerComplex.cpp",
     "Interp/Context.cpp",
     "Interp/Descriptor.cpp",
     "Interp/Disasm.cpp",
diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
index 293c02473953c7..b38de8d65536b0 100644
--- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
@@ -226,6 +226,9 @@ copy("Headers") {
     "llvm_libc_wrappers/stdlib.h",
     "llvm_libc_wrappers/string.h",
     "llvm_libc_wrappers/time.h",
+    "llvm_offload_wrappers/__llvm_offload.h",
+    "llvm_offload_wrappers/__llvm_offload_device.h",
+    "llvm_offload_wrappers/__llvm_offload_host.h",
     "lsxintrin.h",
     "lwpintrin.h",
     "lzcntintrin.h",
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index aa2b4543927a7f..dddbd837c1cbc7 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -126,11 +126,7 @@ def AMDGPU_RawBufferLoadOp :
                    DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
                    OptionalAttr<I32Attr>:$indexOffset,
                    Optional<I32>:$sgprOffset)>,
-    Results<(outs AnyTypeOf<[BF16, F16, F32, I32, I8, F8E5M2FNUZ, F8E4M3FNUZ,
-                              VectorOfLengthAndType<[2, 4], [F32, I32]>,
-                              VectorOfLengthAndType<[2, 4, 8], [F16, BF16]>,
-                              VectorOfLengthAndType<[2, 4, 8, 16],
-                                [I8, F8E5M2FNUZ, F8E4M3FNUZ]>]>:$value)> {
+    Results<(outs AnyType:$value)> {
 
   let summary = "Raw Buffer load, exposing GCN features";
   let description = [{
@@ -176,11 +172,7 @@ def AMDGPU_RawBufferLoadOp :
 def AMDGPU_RawBufferStoreOp :
     AMDGPU_Op<"raw_buffer_store", [AllElementTypesMatch<["value", "memref"]>,
       AttrSizedOperandSegments]>,
-    Arguments<(ins AnyTypeOf<[BF16, F16, F32, I32, I8, F8E5M2FNUZ, F8E4M3FNUZ,
-                              VectorOfLengthAndType<[2, 4], [F32, I32]>,
-                              VectorOfLengthAndType<[2, 4, 8], [F16, BF16]>,
-                              VectorOfLengthAndType<[2, 4, 8, 16],
-                                [I8, F8E5M2FNUZ, F8E4M3FNUZ]>]>:$value,
+    Arguments<(ins AnyType:$value,
                    Arg<AnyMemRef, "buffer to store to", [MemWrite]>:$memref,
                    Variadic<I32>:$indices,
                    DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
index 57acd72610415f..7b53594a1c8e28 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
@@ -22,6 +22,7 @@
 #include "mlir/IR/Dialect.h"
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/IR/OpImplementation.h"
+#include "mlir/IR/RegionKindInterface.h"
 #include "mlir/IR/SymbolTable.h"
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index c57d291552e606..a024c3018eb8d3 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -21,6 +21,7 @@ include "mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td"
 include "mlir/Dialect/GPU/IR/ParallelLoopMapperAttr.td"
 include "mlir/IR/CommonTypeConstraints.td"
 include "mlir/IR/EnumAttr.td"
+include "mlir/IR/RegionKindInterface.td"
 include "mlir/IR/SymbolInterfaces.td"
 include "mlir/Interfaces/ControlFlowInterfaces.td"
 include "mlir/Interfaces/DataLayoutInterfaces.td"
@@ -1347,10 +1348,7 @@ def GPU_BarrierOp : GPU_Op<"barrier"> {
 
 def GPU_GPUModuleOp : GPU_Op<"module", [
       DataLayoutOpInterface, HasDefaultDLTIDataLayout, IsolatedFromAbove,
-      SymbolTable, Symbol, SingleBlockImplicitTerminator<"ModuleEndOp">
-    ]>, Arguments<(ins SymbolNameAttr:$sym_name,
-          OptionalAttr<GPUNonEmptyTargetArrayAttr>:$targets,
-          OptionalAttr<OffloadingTranslationAttr>:$offloadingHandler)> {
+      NoRegionArguments, SymbolTable, Symbol] # GraphRegionNoTerminator.traits> {
   let summary = "A top level compilation unit containing code to be run on a GPU.";
   let description = [{
     GPU module contains code that is intended to be run on a GPU. A host device
@@ -1379,7 +1377,6 @@ def GPU_GPUModuleOp : GPU_Op<"module", [
     gpu.module @symbol_name {
       gpu.func {}
         ...
-      gpu.module_end
     }
     // Module with offloading handler and target attributes.
     gpu.module @symbol_name2 <#gpu.select_object<1>> [
@@ -1387,7 +1384,6 @@ def GPU_GPUModuleOp : GPU_Op<"module", [
         #rocdl.target<chip = "gfx90a">] {
       gpu.func {}
         ...
-      gpu.module_end
     }
     ```
   }];
@@ -1399,8 +1395,18 @@ def GPU_GPUModuleOp : GPU_Op<"module", [
                    "ArrayRef<Attribute>":$targets,
                    CArg<"Attribute", "{}">:$handler)>
   ];
+
+  let arguments = (ins
+      SymbolNameAttr:$sym_name,
+      OptionalAttr<GPUNonEmptyTargetArrayAttr>:$targets,
+      OptionalAttr<OffloadingTranslationAttr>:$offloadingHandler);
   let regions = (region SizedRegion<1>:$bodyRegion);
-  let hasCustomAssemblyFormat = 1;
+  let assemblyFormat = [{
+    $sym_name
+    (`<` $offloadingHandler^ `>`)?
+    ($targets^)?
+    attr-dict-with-keyword $bodyRegion
+  }];
 
   // We need to ensure the block inside the region is properly terminated;
   // the auto-generated builders do not guarantee that.
@@ -1415,17 +1421,6 @@ def GPU_GPUModuleOp : GPU_Op<"module", [
   }];
 }
 
-def GPU_ModuleEndOp : GPU_Op<"module_end", [
-  Terminator, HasParent<"GPUModuleOp">
-]> {
-  let summary = "A pseudo op that marks the end of a gpu.module.";
-  let description = [{
-    This op terminates the only block inside the only region of a `gpu.module`.
-  }];
-
-  let assemblyFormat = "attr-dict";
-}
-
 def GPU_BinaryOp : GPU_Op<"binary", [Symbol]>, Arguments<(ins
       SymbolNameAttr:$sym_name,
       OptionalAttr<OffloadingTranslationAttr>:$offloadingHandler,
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index c38a2584c8eec1..643522d5903fd0 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -1623,10 +1623,15 @@ def LLVM_ConstantOp
     vectors. It has a mandatory `value` attribute, which may be an integer,
     floating point attribute; dense or sparse attribute containing integers or
     floats. The type of the attribute is one of the corresponding MLIR builtin
-    types. It may be omitted for `i64` and `f64` types that are implied. The
-    operation produces a new SSA value of the specified LLVM IR dialect type.
-    The type of that value _must_ correspond to the attribute type converted to
-    LLVM IR.
+    types. It may be omitted for `i64` and `f64` types that are implied.
+
+    The operation produces a new SSA value of the specified LLVM IR dialect
+    type. Certain builtin types such as integer, float and vector types are
+    also allowed. The result type _must_ correspond to the attribute type
+    converted to LLVM IR. In particular, the number of elements of a container
+    type must match the number of elements in the attribute. If the type is or
+    contains a scalable vector type, the attribute must be a splat elements
+    attribute.
 
     Examples:
 
diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h
index ed7b9ece4a464b..896fdf1c899e3d 100644
--- a/mlir/include/mlir/IR/PatternMatch.h
+++ b/mlir/include/mlir/IR/PatternMatch.h
@@ -289,7 +289,7 @@ class RewritePattern : public Pattern {
   using Pattern::Pattern;
 
 private:
-  /// Trait to check if T provides a `getOperationName` method.
+  /// Trait to check if T provides an `initialize` method.
   template <typename T, typename... Args>
   using has_initialize = decltype(std::declval<T>().initialize());
   template <typename T>
diff --git a/mlir/include/mlir/IR/StorageUniquerSupport.h b/mlir/include/mlir/IR/StorageUniquerSupport.h
index d6ccbbd8579947..2162a74a51580a 100644
--- a/mlir/include/mlir/IR/StorageUniquerSupport.h
+++ b/mlir/include/mlir/IR/StorageUniquerSupport.h
@@ -226,9 +226,7 @@ class StorageUserBase : public BaseT, public Traits<ConcreteT>... {
 
   /// Default implementation that just returns success.
   template <typename... Args>
-  static LogicalResult
-  verifyInvariants(function_ref<InFlightDiagnostic()> emitErrorFn,
-                   Args... args) {
+  static LogicalResult verifyInvariants(Args... args) {
     return success();
   }
 
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index 060a1e1e82f75e..9957a5804c0b65 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -316,7 +316,7 @@ void mlir::configureGpuToNVVMConversionLegality(ConversionTarget &target) {
                       LLVM::SinOp, LLVM::SqrtOp>();
 
   // TODO: Remove once we support replacing non-root ops.
-  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
+  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
 }
 
 template <typename OpTy>
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index 564bab1ad92b90..93e8b080a4f672 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -335,7 +335,7 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
                       LLVM::SqrtOp>();
 
   // TODO: Remove once we support replacing non-root ops.
-  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
+  target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp>();
 }
 
 template <typename OpTy>
diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp
index 98340bf653d61f..b18b6344732eeb 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRV.cpp
@@ -90,19 +90,6 @@ class GPUModuleConversion final : public OpConversionPattern<gpu::GPUModuleOp> {
                   ConversionPatternRewriter &rewriter) const override;
 };
 
-class GPUModuleEndConversion final
-    : public OpConversionPattern<gpu::ModuleEndOp> {
-public:
-  using OpConversionPattern::OpConversionPattern;
-
-  LogicalResult
-  matchAndRewrite(gpu::ModuleEndOp endOp, OpAdaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-    rewriter.eraseOp(endOp);
-    return success();
-  }
-};
-
 /// Pattern to convert a gpu.return into a SPIR-V return.
 // TODO: This can go to DRR when GPU return has operands.
 class GPUReturnOpConversion final : public OpConversionPattern<gpu::ReturnOp> {
@@ -614,7 +601,7 @@ void mlir::populateGPUToSPIRVPatterns(SPIRVTypeConverter &typeConverter,
                                       RewritePatternSet &patterns) {
   patterns.add<
       GPUBarrierConversion, GPUFuncOpConversion, GPUModuleConversion,
-      GPUModuleEndConversion, GPUReturnOpConversion, GPUShuffleConversion,
+      GPUReturnOpConversion, GPUShuffleConversion,
       LaunchConfigConversion<gpu::BlockIdOp, spirv::BuiltIn::WorkgroupId>,
       LaunchConfigConversion<gpu::GridDimOp, spirv::BuiltIn::NumWorkgroups>,
       LaunchConfigConversion<gpu::BlockDimOp, spirv::BuiltIn::WorkgroupSize>,
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index a1f87a637a6141..eeffe829446cf9 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1736,8 +1736,7 @@ LogicalResult gpu::ReturnOp::verify() {
 void GPUModuleOp::build(OpBuilder &builder, OperationState &result,
                         StringRef name, ArrayAttr targets,
                         Attribute offloadingHandler) {
-  ensureTerminator(*result.addRegion(), builder, result.location);
-
+  result.addRegion()->emplaceBlock();
   Properties &props = result.getOrAddProperties<Properties>();
   if (targets)
     props.targets = targets;
@@ -1753,73 +1752,6 @@ void GPUModuleOp::build(OpBuilder &builder, OperationState &result,
         offloadingHandler);
 }
 
-ParseResult GPUModuleOp::parse(OpAsmParser &parser, OperationState &result) {
-  StringAttr nameAttr;
-  ArrayAttr targetsAttr;
-
-  if (parser.parseSymbolName(nameAttr))
-    return failure();
-
-  Properties &props = result.getOrAddProperties<Properties>();
-  props.setSymName(nameAttr);
-
-  // Parse the optional offloadingHandler
-  if (succeeded(parser.parseOptionalLess())) {
-    if (parser.parseAttribute(props.offloadingHandler))
-      return failure();
-    if (parser.parseGreater())
-      return failure();
-  }
-
-  // Parse the optional array of target attributes.
-  OptionalParseResult targetsAttrResult =
-      parser.parseOptionalAttribute(targetsAttr, Type{});
-  if (targetsAttrResult.has_value()) {
-    if (failed(*targetsAttrResult)) {
-      return failure();
-    }
-    props.targets = targetsAttr;
-  }
-
-  // If module attributes are present, parse them.
-  if (parser.parseOptionalAttrDictWithKeyword(result.attributes))
-    return failure();
-
-  // Parse the module body.
-  auto *body = result.addRegion();
-  if (parser.parseRegion(*body, {}))
-    return failure();
-
-  // Ensure that this module has a valid terminator.
-  GPUModuleOp::ensureTerminator(*body, parser.getBuilder(), result.location);
-  return success();
-}
-
-void GPUModuleOp::print(OpAsmPrinter &p) {
-  p << ' ';
-  p.printSymbolName(getName());
-
-  if (Attribute attr = getOffloadingHandlerAttr()) {
-    p << " <";
-    p.printAttribute(attr);
-    p << ">";
-  }
-
-  if (Attribute attr = getTargetsAttr()) {
-    p << ' ';
-    p.printAttribute(attr);
-    p << ' ';
-  }
-
-  p.printOptionalAttrDictWithKeyword((*this)->getAttrs(),
-                                     {mlir::SymbolTable::getSymbolAttrName(),
-                                      getTargetsAttrName(),
-                                      getOffloadingHandlerAttrName()});
-  p << ' ';
-  p.printRegion(getRegion(), /*printEntryBlockArgs=*/false,
-                /*printBlockTerminators=*/false);
-}
-
 bool GPUModuleOp::hasTarget(Attribute target) {
   if (ArrayAttr targets = getTargetsAttr())
     return llvm::count(targets.getValue(), target);
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
index 90610118a45cd2..07262bb8e1bacb 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
@@ -2666,6 +2666,39 @@ OpFoldResult LLVM::ZeroOp::fold(FoldAdaptor) {
 // ConstantOp.
 //===----------------------------------------------------------------------===//
 
+/// Returns the total scalar element count of `t`, multiplying through nested
+/// `VectorType`, `LLVMArrayType` and `LLVMFixedVectorType` levels; any other
+/// type counts as 1. Asserts on `LLVMScalableVectorType` (count is unknown).
+static int64_t getNumElements(Type t) {
+  if (auto vecType = dyn_cast<VectorType>(t))
+    return vecType.getNumElements() * getNumElements(vecType.getElementType());
+  if (auto arrayType = dyn_cast<LLVM::LLVMArrayType>(t))
+    return arrayType.getNumElements() *
+           getNumElements(arrayType.getElementType());
+  if (auto vecType = dyn_cast<LLVMFixedVectorType>(t))
+    return vecType.getNumElements() * getNumElements(vecType.getElementType());
+  assert(!isa<LLVM::LLVMScalableVectorType>(t) &&
+         "number of elements of a scalable vector type is unknown");
+  return 1;
+}
+
+/// Returns true if `t` is scalable (`LLVMScalableVectorType` or a scalable
+/// `VectorType`) or is a vector/array type nesting such an element type.
+static bool hasScalableVectorType(Type t) {
+  if (isa<LLVM::LLVMScalableVectorType>(t))
+    return true;
+  if (auto vecType = dyn_cast<VectorType>(t)) {
+    if (vecType.isScalable())
+      return true;
+    return hasScalableVectorType(vecType.getElementType());
+  }
+  if (auto arrayType = dyn_cast<LLVM::LLVMArrayType>(t))
+    return hasScalableVectorType(arrayType.getElementType());
+  if (auto vecType = dyn_cast<LLVMFixedVectorType>(t))
+    return hasScalableVectorType(vecType.getElementType());
+  return false;
+}
+
 LogicalResult LLVM::ConstantOp::verify() {
   if (StringAttr sAttr = llvm::dyn_cast<StringAttr>(getValue())) {
     auto arrayType = llvm::dyn_cast<LLVMArrayType>(getType());
@@ -2708,14 +2741,12 @@ LogicalResult LLVM::ConstantOp::verify() {
   if (auto targetExtType = dyn_cast<LLVMTargetExtType>(getType())) {
     return emitOpError() << "does not support target extension type.";
   }
-  if (!llvm::isa<IntegerAttr, ArrayAttr, FloatAttr, ElementsAttr>(getValue()))
-    return emitOpError()
-           << "only supports integer, float, string or elements attributes";
+
+  // Verification of IntegerAttr, FloatAttr, ElementsAttr, ArrayAttr.
   if (auto intAttr = dyn_cast<IntegerAttr>(getValue())) {
     if (!llvm::isa<IntegerType>(getType()))
       return emitOpError() << "expected integer type";
-  }
-  if (auto floatAttr = dyn_cast<FloatAttr>(getValue())) {
+  } else if (auto floatAttr = dyn_cast<FloatAttr>(getValue())) {
     const llvm::fltSemantics &sem = floatAttr.getValue().getSemantics();
     unsigned floatWidth = APFloat::getSizeInBits(sem);
     if (auto floatTy = dyn_cast<FloatType>(getType())) {
@@ -2728,13 +2759,34 @@ LogicalResult LLVM::ConstantOp::verify() {
     if (isa<IntegerType>(getType()) && !getType().isInteger(floatWidth)) {
       return emitOpError() << "expected integer type of width " << floatWidth;
     }
-  }
-  if (auto splatAttr = dyn_cast<SplatElementsAttr>(getValue())) {
-    if (!isa<VectorType>(getType()) && !isa<LLVM::LLVMArrayType>(getType()) &&
-        !isa<LLVM::LLVMFixedVectorType>(getType()) &&
-        !isa<LLVM::LLVMScalableVectorType>(getType()))
+  } else if (isa<ElementsAttr, ArrayAttr>(getValue())) {
+    if (hasScalableVectorType(getType())) {
+      // The exact number of elements of a scalable vector is unknown, so we
+      // allow only splat attributes.
+      auto splatElementsAttr = dyn_cast<SplatElementsAttr>(getValue());
+      if (!splatElementsAttr)
+        return emitOpError()
+               << "scalable vector type requires a splat attribute";
+      return success();
+    }
+    if (!isa<VectorType, LLVM::LLVMArrayType, LLVM::LLVMFixedVectorType>(
+            getType()))
       return emitOpError() << "expected vector or array type";
+    // The number of elements of the attribute and the type must match.
+    int64_t attrNumElements;
+    if (auto elementsAttr = dyn_cast<ElementsAttr>(getValue()))
+      attrNumElements = elementsAttr.getNumElements();
+    else
+      attrNumElements = cast<ArrayAttr>(getValue()).size();
+    if (getNumElements(getType()) != attrNumElements)
+      return emitOpError()
+             << "type and attribute have a different number of elements: "
+             << getNumElements(getType()) << " vs. " << attrNumElements;
+  } else {
+    return emitOpError()
+           << "only supports integer, float, string or elements attributes";
   }
+
   return success();
 }
 
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-32b.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-32b.mlir
index 7b873463a5f98f..1a22ba662cbf74 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-32b.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-32b.mlir
@@ -67,7 +67,7 @@ module attributes {transform.with_named_sequence} {
         {index_bitwidth = 32, use_opaque_pointers = true}
     } {
       legal_dialects = ["llvm", "memref", "nvvm"],
-      legal_ops = ["func.func", "gpu.module", "gpu.module_end", "gpu.yield"],
+      legal_ops = ["func.func", "gpu.module", "gpu.yield"],
       illegal_dialects = ["gpu"],
       illegal_ops = ["llvm.cos", "llvm.exp", "llvm.exp2", "llvm.fabs", "llvm.fceil",
                     "llvm.ffloor", "llvm.log", "llvm.log10", "llvm.log2", "llvm.pow",
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index c23b11e46b24c7..8f2ec289c9252c 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -942,7 +942,7 @@ module attributes {transform.with_named_sequence} {
         use_bare_ptr_call_conv = false}
     } {
       legal_dialects = ["llvm", "memref", "nvvm", "test"],
-      legal_ops = ["func.func", "gpu.module", "gpu.module_end", "gpu.yield"],
+      legal_ops = ["func.func", "gpu.module", "gpu.yield"],
       illegal_dialects = ["gpu"],
       illegal_ops = ["llvm.copysign", "llvm.cos", "llvm.exp", "llvm.exp2", "llvm.fabs", "llvm.fceil",
                     "llvm.ffloor", "llvm.fma", "llvm.frem", "llvm.log", "llvm.log10", "llvm.log2", "llvm.pow",
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index d164e875097968..9b61c4493994c2 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -1049,8 +1049,8 @@ func.func @shuffle_2D(%a: vector<1x4xf32>, %b: vector<2x4xf32>) -> vector<3x4xf3
 
 // -----
 
-// CHECK-LABEL: @extract_element_0d
-func.func @extract_element_0d(%a: vector<f32>) -> f32 {
+// CHECK-LABEL: @extractelement_0d
+func.func @extractelement_0d(%a: vector<f32>) -> f32 {
   // CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : index) : i64
   // CHECK: llvm.extractelement %{{.*}}[%[[C0]] : {{.*}}] : vector<1xf32>
   %1 = vector.extractelement %a[] : vector<f32>
@@ -1059,31 +1059,54 @@ func.func @extract_element_0d(%a: vector<f32>) -> f32 {
 
 // -----
 
-func.func @extract_element(%arg0: vector<16xf32>) -> f32 {
+func.func @extractelement(%arg0: vector<16xf32>) -> f32 {
   %0 = arith.constant 15 : i32
   %1 = vector.extractelement %arg0[%0 : i32]: vector<16xf32>
   return %1 : f32
 }
-// CHECK-LABEL: @extract_element(
+// CHECK-LABEL: @extractelement(
 // CHECK-SAME: %[[A:.*]]: vector<16xf32>)
 //       CHECK:   %[[c:.*]] = arith.constant 15 : i32
 //       CHECK:   %[[x:.*]] = llvm.extractelement %[[A]][%[[c]] : i32] : vector<16xf32>
 //       CHECK:   return %[[x]] : f32
 
+func.func @extractelement_scalable(%arg0: vector<[16]xf32>) -> f32 {
+  %0 = arith.constant 15 : i32
+  %1 = vector.extractelement %arg0[%0 : i32]: vector<[16]xf32>
+  return %1 : f32
+}
+// CHECK-LABEL: @extractelement_scalable(
+// CHECK-SAME: %[[A:.*]]: vector<[16]xf32>)
+//       CHECK:   %[[c:.*]] = arith.constant 15 : i32
+//       CHECK:   %[[x:.*]] = llvm.extractelement %[[A]][%[[c]] : i32] : vector<[16]xf32>
+//       CHECK:   return %[[x]] : f32
+
 // -----
 
-func.func @extract_element_index(%arg0: vector<16xf32>) -> f32 {
+func.func @extractelement_index(%arg0: vector<16xf32>) -> f32 {
   %0 = arith.constant 15 : index
   %1 = vector.extractelement %arg0[%0 : index]: vector<16xf32>
   return %1 : f32
 }
-// CHECK-LABEL: @extract_element_index(
+// CHECK-LABEL: @extractelement_index(
 // CHECK-SAME: %[[A:.*]]: vector<16xf32>)
 //       CHECK:   %[[c:.*]] = arith.constant 15 : index
 //       CHECK:   %[[i:.*]] = builtin.unrealized_conversion_cast %[[c]] : index to i64
 //       CHECK:   %[[x:.*]] = llvm.extractelement %[[A]][%[[i]] : i64] : vector<16xf32>
 //       CHECK:   return %[[x]] : f32
 
+func.func @extractelement_index_scalable(%arg0: vector<[16]xf32>) -> f32 {
+  %0 = arith.constant 15 : index
+  %1 = vector.extractelement %arg0[%0 : index]: vector<[16]xf32>
+  return %1 : f32
+}
+// CHECK-LABEL: @extractelement_index_scalable(
+// CHECK-SAME: %[[A:.*]]: vector<[16]xf32>)
+//       CHECK:   %[[c:.*]] = arith.constant 15 : index
+//       CHECK:   %[[i:.*]] = builtin.unrealized_conversion_cast %[[c]] : index to i64
+//       CHECK:   %[[x:.*]] = llvm.extractelement %[[A]][%[[i]] : i64] : vector<[16]xf32>
+//       CHECK:   return %[[x]] : f32
+
 // -----
 
 func.func @extract_element_from_vec_1d(%arg0: vector<16xf32>) -> f32 {
@@ -1095,6 +1118,15 @@ func.func @extract_element_from_vec_1d(%arg0: vector<16xf32>) -> f32 {
 //       CHECK:   llvm.extractelement {{.*}}[{{.*}} : i64] : vector<16xf32>
 //       CHECK:   return {{.*}} : f32
 
+func.func @extract_element_from_vec_1d_scalable(%arg0: vector<[16]xf32>) -> f32 {
+  %0 = vector.extract %arg0[15]: f32 from vector<[16]xf32>
+  return %0 : f32
+}
+// CHECK-LABEL: @extract_element_from_vec_1d_scalable
+//       CHECK:   llvm.mlir.constant(15 : i64) : i64
+//       CHECK:   llvm.extractelement {{.*}}[{{.*}} : i64] : vector<[16]xf32>
+//       CHECK:   return {{.*}} : f32
+
 // -----
 
 func.func @extract_index_element_from_vec_1d(%arg0: vector<16xindex>) -> index {
@@ -1109,6 +1141,18 @@ func.func @extract_index_element_from_vec_1d(%arg0: vector<16xindex>) -> index {
 //       CHECK:   %[[T3:.*]] = builtin.unrealized_conversion_cast %[[T2]] : i64 to index
 //       CHECK:   return %[[T3]] : index
 
+func.func @extract_index_element_from_vec_1d_scalable(%arg0: vector<[16]xindex>) -> index {
+  %0 = vector.extract %arg0[15]: index from vector<[16]xindex>
+  return %0 : index
+}
+// CHECK-LABEL: @extract_index_element_from_vec_1d_scalable(
+// CHECK-SAME: %[[A:.*]]: vector<[16]xindex>)
+//       CHECK:   %[[T0:.*]] = builtin.unrealized_conversion_cast %[[A]] : vector<[16]xindex> to vector<[16]xi64>
+//       CHECK:   %[[T1:.*]] = llvm.mlir.constant(15 : i64) : i64
+//       CHECK:   %[[T2:.*]] = llvm.extractelement %[[T0]][%[[T1]] : i64] : vector<[16]xi64>
+//       CHECK:   %[[T3:.*]] = builtin.unrealized_conversion_cast %[[T2]] : i64 to index
+//       CHECK:   return %[[T3]] : index
+
 // -----
 
 func.func @extract_vec_2d_from_vec_3d(%arg0: vector<4x3x16xf32>) -> vector<3x16xf32> {
@@ -1119,6 +1163,14 @@ func.func @extract_vec_2d_from_vec_3d(%arg0: vector<4x3x16xf32>) -> vector<3x16x
 //       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm.array<4 x array<3 x vector<16xf32>>>
 //       CHECK:   return {{.*}} : vector<3x16xf32>
 
+func.func @extract_vec_2d_from_vec_3d_scalable(%arg0: vector<4x3x[16]xf32>) -> vector<3x[16]xf32> {
+  %0 = vector.extract %arg0[0]: vector<3x[16]xf32> from vector<4x3x[16]xf32>
+  return %0 : vector<3x[16]xf32>
+}
+// CHECK-LABEL: @extract_vec_2d_from_vec_3d_scalable
+//       CHECK:   llvm.extractvalue {{.*}}[0] : !llvm.array<4 x array<3 x vector<[16]xf32>>>
+//       CHECK:   return {{.*}} : vector<3x[16]xf32>
+
 // -----
 
 func.func @extract_vec_1d_from_vec_3d(%arg0: vector<4x3x16xf32>) -> vector<16xf32> {
@@ -1129,6 +1181,14 @@ func.func @extract_vec_1d_from_vec_3d(%arg0: vector<4x3x16xf32>) -> vector<16xf3
 //       CHECK:   llvm.extractvalue {{.*}}[0, 0] : !llvm.array<4 x array<3 x vector<16xf32>>>
 //       CHECK:   return {{.*}} : vector<16xf32>
 
+func.func @extract_vec_1d_from_vec_3d_scalable(%arg0: vector<4x3x[16]xf32>) -> vector<[16]xf32> {
+  %0 = vector.extract %arg0[0, 0]: vector<[16]xf32> from vector<4x3x[16]xf32>
+  return %0 : vector<[16]xf32>
+}
+// CHECK-LABEL: @extract_vec_1d_from_vec_3d_scalable
+//       CHECK:   llvm.extractvalue {{.*}}[0, 0] : !llvm.array<4 x array<3 x vector<[16]xf32>>>
+//       CHECK:   return {{.*}} : vector<[16]xf32>
+
 // -----
 
 func.func @extract_element_from_vec_3d(%arg0: vector<4x3x16xf32>) -> f32 {
@@ -1141,6 +1201,16 @@ func.func @extract_element_from_vec_3d(%arg0: vector<4x3x16xf32>) -> f32 {
 //       CHECK:   llvm.extractelement {{.*}}[{{.*}} : i64] : vector<16xf32>
 //       CHECK:   return {{.*}} : f32
 
+func.func @extract_element_from_vec_3d_scalable(%arg0: vector<4x3x[16]xf32>) -> f32 {
+  %0 = vector.extract %arg0[0, 0, 0]: f32 from vector<4x3x[16]xf32>
+  return %0 : f32
+}
+// CHECK-LABEL: @extract_element_from_vec_3d_scalable
+//       CHECK:   llvm.extractvalue {{.*}}[0, 0] : !llvm.array<4 x array<3 x vector<[16]xf32>>>
+//       CHECK:   llvm.mlir.constant(0 : i64) : i64
+//       CHECK:   llvm.extractelement {{.*}}[{{.*}} : i64] : vector<[16]xf32>
+//       CHECK:   return {{.*}} : f32
+
 // -----
 
 func.func @extract_element_with_value_1d(%arg0: vector<16xf32>, %arg1: index) -> f32 {
@@ -1152,6 +1222,15 @@ func.func @extract_element_with_value_1d(%arg0: vector<16xf32>, %arg1: index) ->
 //       CHECK:   %[[UC:.+]] = builtin.unrealized_conversion_cast %[[INDEX]] : index to i64
 //       CHECK:   llvm.extractelement %[[VEC]][%[[UC]] : i64] : vector<16xf32>
 
+func.func @extract_element_with_value_1d_scalable(%arg0: vector<[16]xf32>, %arg1: index) -> f32 {
+  %0 = vector.extract %arg0[%arg1]: f32 from vector<[16]xf32>
+  return %0 : f32
+}
+// CHECK-LABEL: @extract_element_with_value_1d_scalable
+//  CHECK-SAME:   %[[VEC:.+]]: vector<[16]xf32>, %[[INDEX:.+]]: index
+//       CHECK:   %[[UC:.+]] = builtin.unrealized_conversion_cast %[[INDEX]] : index to i64
+//       CHECK:   llvm.extractelement %[[VEC]][%[[UC]] : i64] : vector<[16]xf32>
+
 // -----
 
 func.func @extract_element_with_value_2d(%arg0: vector<1x16xf32>, %arg1: index) -> f32 {
diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir
index fe288dab973f5a..62346ce0d2c4b1 100644
--- a/mlir/test/Dialect/LLVMIR/invalid.mlir
+++ b/mlir/test/Dialect/LLVMIR/invalid.mlir
@@ -414,6 +414,22 @@ llvm.func @struct_wrong_element_types() -> !llvm.struct<(!llvm.array<2 x f64>, !
 
 // -----
 
+llvm.func @const_wrong_number_of_elements() -> vector<5xf64> {
+  // expected-error @+1{{type and attribute have a different number of elements: 5 vs. 2}}
+  %0 = llvm.mlir.constant(dense<[1.0, 1.0]> : tensor<2xf64>) : vector<5xf64>
+  llvm.return %0 : vector<5xf64>
+}
+
+// -----
+
+llvm.func @scalable_vec_requires_splat() -> vector<[4]xf64> {
+  // expected-error @+1{{scalable vector type requires a splat attribute}}
+  %0 = llvm.mlir.constant(dense<[1.0, 1.0, 2.0, 2.0]> : tensor<4xf64>) : vector<[4]xf64>
+  llvm.return %0 : vector<[4]xf64>
+}
+
+// -----
+
 func.func @insertvalue_non_llvm_type(%a : i32, %b : i32) {
   // expected-error@+2 {{expected LLVM IR Dialect type}}
   llvm.insertvalue %a, %b[0] : tensor<*xi32>
diff --git a/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir b/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir
index eb0db736d5da58..75c5ad26fcf231 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir
@@ -1,364 +1,367 @@
-// RUN: mlir-opt %s -test-vector-transfer-unrolling-patterns --split-input-file | FileCheck %s
-// RUN: mlir-opt %s -test-vector-transfer-unrolling-patterns=reverse-unroll-order --split-input-file | FileCheck %s --check-prefix=ORDER
-
-// CHECK-LABEL: func @transfer_read_unroll
-//       CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//       CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//       CHECK:   %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC0:.*]] = vector.insert_strided_slice %[[VTR0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
-//  CHECK-NEXT:   %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC1:.*]] = vector.insert_strided_slice %[[VTR1]], %[[VEC0]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
-//  CHECK-NEXT:   %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC2:.*]] = vector.insert_strided_slice %[[VTR2]], %[[VEC1]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
-//  CHECK-NEXT:   %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C2]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC3:.*]] = vector.insert_strided_slice %[[VTR3]], %[[VEC2]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
-//  CHECK-NEXT:   return %[[VEC3]] : vector<4x4xf32>
-
-// ORDER-LABEL: func @transfer_read_unroll
-//       ORDER-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//       ORDER-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//       ORDER:   %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
-//  ORDER-NEXT:   %[[VEC0:.*]] = vector.insert_strided_slice %[[VTR0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
-//  ORDER-NEXT:   %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
-//  ORDER-NEXT:   %[[VEC1:.*]] = vector.insert_strided_slice %[[VTR1]], %[[VEC0]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
-//  ORDER-NEXT:   %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
-//  ORDER-NEXT:   %[[VEC2:.*]] = vector.insert_strided_slice %[[VTR2]], %[[VEC1]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
-//  ORDER-NEXT:   %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C2]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
-//  ORDER-NEXT:   %[[VEC3:.*]] = vector.insert_strided_slice %[[VTR3]], %[[VEC2]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
-//  ORDER-NEXT:   return %[[VEC3]] : vector<4x4xf32>
-
-func.func @transfer_read_unroll(%arg0 : memref<4x4xf32>) -> vector<4x4xf32> {
+// RUN: mlir-opt %s -test-vector-transfer-unrolling-patterns --split-input-file | FileCheck %s --check-prefixes=ALL,CHECK
+// RUN: mlir-opt %s -test-vector-transfer-unrolling-patterns=reverse-unroll-order --split-input-file | FileCheck %s --check-prefixes=ALL,ORDER
+
+// ALL-LABEL:   func @transfer_read_unroll
+// CHECK-DAG:     %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// CHECK:         %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC0:.*]] = vector.insert_strided_slice %[[VTR0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
+// CHECK-NEXT:    %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC1:.*]] = vector.insert_strided_slice %[[VTR1]], %[[VEC0]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
+// CHECK-NEXT:    %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC2:.*]] = vector.insert_strided_slice %[[VTR2]], %[[VEC1]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
+// CHECK-NEXT:    %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C2]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC3:.*]] = vector.insert_strided_slice %[[VTR3]], %[[VEC2]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
+// CHECK-NEXT:    return %[[VEC3]] : vector<4x4xf32>
+
+// ORDER-DAG:     %[[C2:.*]] = arith.constant 2 : index
+// ORDER-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// ORDER:         %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
+// ORDER-NEXT:    %[[VEC0:.*]] = vector.insert_strided_slice %[[VTR0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
+// ORDER-NEXT:    %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
+// ORDER-NEXT:    %[[VEC1:.*]] = vector.insert_strided_slice %[[VTR1]], %[[VEC0]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
+// ORDER-NEXT:    %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
+// ORDER-NEXT:    %[[VEC2:.*]] = vector.insert_strided_slice %[[VTR2]], %[[VEC1]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
+// ORDER-NEXT:    %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C2]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
+// ORDER-NEXT:    %[[VEC3:.*]] = vector.insert_strided_slice %[[VTR3]], %[[VEC2]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
+// ORDER-NEXT:    return %[[VEC3]] : vector<4x4xf32>
+
+func.func @transfer_read_unroll(%mem : memref<4x4xf32>) -> vector<4x4xf32> {
   %c0 = arith.constant 0 : index
   %cf0 = arith.constant 0.0 : f32
-  %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 : memref<4x4xf32>, vector<4x4xf32>
-  return %0 : vector<4x4xf32>
+  %res = vector.transfer_read %mem[%c0, %c0], %cf0 : memref<4x4xf32>, vector<4x4xf32>
+  return %res : vector<4x4xf32>
 }
 
-// CHECK-LABEL: func @transfer_write_unroll
-//       CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//       CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//       CHECK:   %[[S0:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
-//  CHECK-NEXT:   vector.transfer_write %[[S0]], {{.*}}[%[[C0]], %[[C0]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
-//  CHECK-NEXT:   %[[S1:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
-//  CHECK-NEXT:   vector.transfer_write %[[S1]], {{.*}}[%[[C0]], %[[C2]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
-//  CHECK-NEXT:   %[[S2:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
-//  CHECK-NEXT:   vector.transfer_write %[[S2]], {{.*}}[%[[C2]], %[[C0]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
-//  CHECK-NEXT:   %[[S3:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
-//  CHECK-NEXT:   vector.transfer_write %[[S3]], {{.*}}[%[[C2]], %[[C2]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
-//  CHECK-NEXT:   return
-
-// ORDER-LABEL: func @transfer_write_unroll
-//       ORDER-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//       ORDER-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//       ORDER:   %[[S0:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
-//  ORDER-NEXT:   vector.transfer_write %[[S0]], {{.*}}[%[[C0]], %[[C0]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
-//  ORDER-NEXT:   %[[S1:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
-//  ORDER-NEXT:   vector.transfer_write %[[S1]], {{.*}}[%[[C2]], %[[C0]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
-//  ORDER-NEXT:   %[[S2:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
-//  ORDER-NEXT:   vector.transfer_write %[[S2]], {{.*}}[%[[C0]], %[[C2]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
-//  ORDER-NEXT:   %[[S3:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
-//  ORDER-NEXT:   vector.transfer_write %[[S3]], {{.*}}[%[[C2]], %[[C2]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
-//  ORDER-NEXT:   return
-
-func.func @transfer_write_unroll(%arg0 : memref<4x4xf32>, %arg1 : vector<4x4xf32>) {
+// -----
+
+// ALL-LABEL:   func @transfer_write_unroll
+// CHECK-DAG:     %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// CHECK:         %[[S0:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
+// CHECK-NEXT:    vector.transfer_write %[[S0]], {{.*}}[%[[C0]], %[[C0]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
+// CHECK-NEXT:    %[[S1:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
+// CHECK-NEXT:    vector.transfer_write %[[S1]], {{.*}}[%[[C0]], %[[C2]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
+// CHECK-NEXT:    %[[S2:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
+// CHECK-NEXT:    vector.transfer_write %[[S2]], {{.*}}[%[[C2]], %[[C0]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
+// CHECK-NEXT:    %[[S3:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
+// CHECK-NEXT:    vector.transfer_write %[[S3]], {{.*}}[%[[C2]], %[[C2]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
+// CHECK-NEXT:    return
+
+// ORDER-DAG:     %[[C2:.*]] = arith.constant 2 : index
+// ORDER-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// ORDER:         %[[S0:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
+// ORDER-NEXT:    vector.transfer_write %[[S0]], {{.*}}[%[[C0]], %[[C0]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
+// ORDER-NEXT:    %[[S1:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
+// ORDER-NEXT:    vector.transfer_write %[[S1]], {{.*}}[%[[C2]], %[[C0]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
+// ORDER-NEXT:    %[[S2:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
+// ORDER-NEXT:    vector.transfer_write %[[S2]], {{.*}}[%[[C0]], %[[C2]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
+// ORDER-NEXT:    %[[S3:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
+// ORDER-NEXT:    vector.transfer_write %[[S3]], {{.*}}[%[[C2]], %[[C2]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
+// ORDER-NEXT:    return
+
+func.func @transfer_write_unroll(%mem : memref<4x4xf32>, %vec : vector<4x4xf32>) {
   %c0 = arith.constant 0 : index
-  vector.transfer_write %arg1, %arg0[%c0, %c0] : vector<4x4xf32>, memref<4x4xf32>
+  vector.transfer_write %vec, %mem[%c0, %c0] : vector<4x4xf32>, memref<4x4xf32>
   return
 }
 
+// -----
+
 // CHECK-LABEL: func @transfer_readwrite_unroll
-//       CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//       CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//       CHECK:   %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C2]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   vector.transfer_write %[[VTR0]], {{.*}}[%[[C0]], %[[C0]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
-//  CHECK-NEXT:   vector.transfer_write %[[VTR1]], {{.*}}[%[[C0]], %[[C2]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
-//  CHECK-NEXT:   vector.transfer_write %[[VTR2]], {{.*}}[%[[C2]], %[[C0]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
-//  CHECK-NEXT:   vector.transfer_write %[[VTR3]], {{.*}}[%[[C2]], %[[C2]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
-//  CHECK-NEXT:   return
-
-func.func @transfer_readwrite_unroll(%arg0 : memref<4x4xf32>) {
+// CHECK-DAG:     %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// CHECK:         %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C2]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    vector.transfer_write %[[VTR0]], {{.*}}[%[[C0]], %[[C0]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
+// CHECK-NEXT:    vector.transfer_write %[[VTR1]], {{.*}}[%[[C0]], %[[C2]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
+// CHECK-NEXT:    vector.transfer_write %[[VTR2]], {{.*}}[%[[C2]], %[[C0]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
+// CHECK-NEXT:    vector.transfer_write %[[VTR3]], {{.*}}[%[[C2]], %[[C2]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32>
+// CHECK-NEXT:    return
+
+func.func @transfer_readwrite_unroll(%mem : memref<4x4xf32>) {
   %c0 = arith.constant 0 : index
   %cf0 = arith.constant 0.0 : f32
-  %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 : memref<4x4xf32>, vector<4x4xf32>
-  vector.transfer_write %0, %arg0[%c0, %c0] : vector<4x4xf32>, memref<4x4xf32>
+  %0 = vector.transfer_read %mem[%c0, %c0], %cf0 : memref<4x4xf32>, vector<4x4xf32>
+  vector.transfer_write %0, %mem[%c0, %c0] : vector<4x4xf32>, memref<4x4xf32>
   return
 }
 
+// -----
+
 // CHECK-LABEL: func @transfer_read_unroll_tensor
-//       CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//       CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//       CHECK:   %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC0:.*]] = vector.insert_strided_slice %[[VTR0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
-//  CHECK-NEXT:   %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC1:.*]] = vector.insert_strided_slice %[[VTR1]], %[[VEC0]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
-//  CHECK-NEXT:   %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC2:.*]] = vector.insert_strided_slice %[[VTR2]], %[[VEC1]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
-//  CHECK-NEXT:   %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C2]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC3:.*]] = vector.insert_strided_slice %[[VTR3]], %[[VEC2]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
-//  CHECK-NEXT:   return %[[VEC3]] : vector<4x4xf32>
+// CHECK-DAG:     %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// CHECK:         %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC0:.*]] = vector.insert_strided_slice %[[VTR0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
+// CHECK-NEXT:    %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC1:.*]] = vector.insert_strided_slice %[[VTR1]], %[[VEC0]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
+// CHECK-NEXT:    %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC2:.*]] = vector.insert_strided_slice %[[VTR2]], %[[VEC1]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
+// CHECK-NEXT:    %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C2]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC3:.*]] = vector.insert_strided_slice %[[VTR3]], %[[VEC2]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32>
+// CHECK-NEXT:    return %[[VEC3]] : vector<4x4xf32>
 
 func.func @transfer_read_unroll_tensor(%arg0 : tensor<4x4xf32>) -> vector<4x4xf32> {
   %c0 = arith.constant 0 : index
   %cf0 = arith.constant 0.0 : f32
-  %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 : tensor<4x4xf32>, vector<4x4xf32>
-  return %0 : vector<4x4xf32>
+  %res = vector.transfer_read %arg0[%c0, %c0], %cf0 : tensor<4x4xf32>, vector<4x4xf32>
+  return %res : vector<4x4xf32>
 }
 
+// -----
+
 // CHECK-LABEL: func @transfer_write_unroll_tensor
-//       CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//       CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//       CHECK:   %[[S0:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
-//  CHECK-NEXT:   %[[VTW0:.*]] = vector.transfer_write %[[S0]], {{.*}}[%[[C0]], %[[C0]]] {{.*}} : vector<2x2xf32>, tensor<4x4xf32>
-//  CHECK-NEXT:   %[[S1:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
-//  CHECK-NEXT:   %[[VTW1:.*]] = vector.transfer_write %[[S1]], %[[VTW0]][%[[C0]], %[[C2]]] {{.*}} : vector<2x2xf32>, tensor<4x4xf32>
-//  CHECK-NEXT:   %[[S2:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
-//  CHECK-NEXT:   %[[VTW2:.*]] = vector.transfer_write %[[S2]], %[[VTW1]][%[[C2]], %[[C0]]] {{.*}} : vector<2x2xf32>, tensor<4x4xf32>
-//  CHECK-NEXT:   %[[S3:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
-//  CHECK-NEXT:   %[[VTW3:.*]] = vector.transfer_write %[[S3]], %[[VTW2]][%[[C2]], %[[C2]]] {{.*}} : vector<2x2xf32>, tensor<4x4xf32>
-//  CHECK-NEXT:   return %[[VTW3]] : tensor<4x4xf32>
+// CHECK-DAG:     %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// CHECK:         %[[S0:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
+// CHECK-NEXT:    %[[VTW0:.*]] = vector.transfer_write %[[S0]], {{.*}}[%[[C0]], %[[C0]]] {{.*}} : vector<2x2xf32>, tensor<4x4xf32>
+// CHECK-NEXT:    %[[S1:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
+// CHECK-NEXT:    %[[VTW1:.*]] = vector.transfer_write %[[S1]], %[[VTW0]][%[[C0]], %[[C2]]] {{.*}} : vector<2x2xf32>, tensor<4x4xf32>
+// CHECK-NEXT:    %[[S2:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
+// CHECK-NEXT:    %[[VTW2:.*]] = vector.transfer_write %[[S2]], %[[VTW1]][%[[C2]], %[[C0]]] {{.*}} : vector<2x2xf32>, tensor<4x4xf32>
+// CHECK-NEXT:    %[[S3:.*]] = vector.extract_strided_slice %{{.*}} {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x4xf32> to vector<2x2xf32>
+// CHECK-NEXT:    %[[VTW3:.*]] = vector.transfer_write %[[S3]], %[[VTW2]][%[[C2]], %[[C2]]] {{.*}} : vector<2x2xf32>, tensor<4x4xf32>
+// CHECK-NEXT:    return %[[VTW3]] : tensor<4x4xf32>
 
 func.func @transfer_write_unroll_tensor(%arg0 : tensor<4x4xf32>,
-  %arg1 : vector<4x4xf32>) -> tensor<4x4xf32> {
+  %vec : vector<4x4xf32>) -> tensor<4x4xf32> {
   %c0 = arith.constant 0 : index
-  %r = vector.transfer_write %arg1, %arg0[%c0, %c0] :
+  %res = vector.transfer_write %vec, %arg0[%c0, %c0] :
     vector<4x4xf32>, tensor<4x4xf32>
-  return %r: tensor<4x4xf32>
+  return %res: tensor<4x4xf32>
 }
 
+// -----
+
 // CHECK-LABEL: func @transfer_readwrite_unroll_tensor
-//       CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//       CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//       CHECK:   %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C2]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VTW0:.*]] = vector.transfer_write %[[VTR0]], {{.*}}[%[[C0]], %[[C0]]] {{.*}} : vector<2x2xf32>, tensor<4x4xf32>
-//  CHECK-NEXT:   %[[VTW1:.*]] = vector.transfer_write %[[VTR1]], %[[VTW0]][%[[C0]], %[[C2]]] {{.*}} : vector<2x2xf32>, tensor<4x4xf32>
-//  CHECK-NEXT:   %[[VTW2:.*]] = vector.transfer_write %[[VTR2]], %[[VTW1]][%[[C2]], %[[C0]]] {{.*}} : vector<2x2xf32>, tensor<4x4xf32>
-//  CHECK-NEXT:   %[[VTW3:.*]] = vector.transfer_write %[[VTR3]], %[[VTW2]][%[[C2]], %[[C2]]] {{.*}} : vector<2x2xf32>, tensor<4x4xf32>
-//  CHECK-NEXT:   return %[[VTW3]] : tensor<4x4xf32>
+// CHECK-DAG:     %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// CHECK:         %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C2]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VTW0:.*]] = vector.transfer_write %[[VTR0]], {{.*}}[%[[C0]], %[[C0]]] {{.*}} : vector<2x2xf32>, tensor<4x4xf32>
+// CHECK-NEXT:    %[[VTW1:.*]] = vector.transfer_write %[[VTR1]], %[[VTW0]][%[[C0]], %[[C2]]] {{.*}} : vector<2x2xf32>, tensor<4x4xf32>
+// CHECK-NEXT:    %[[VTW2:.*]] = vector.transfer_write %[[VTR2]], %[[VTW1]][%[[C2]], %[[C0]]] {{.*}} : vector<2x2xf32>, tensor<4x4xf32>
+// CHECK-NEXT:    %[[VTW3:.*]] = vector.transfer_write %[[VTR3]], %[[VTW2]][%[[C2]], %[[C2]]] {{.*}} : vector<2x2xf32>, tensor<4x4xf32>
+// CHECK-NEXT:    return %[[VTW3]] : tensor<4x4xf32>
 
 func.func @transfer_readwrite_unroll_tensor(%arg0 : tensor<4x4xf32>, %arg1 : tensor<4x4xf32>) ->
   tensor<4x4xf32> {
   %c0 = arith.constant 0 : index
   %cf0 = arith.constant 0.0 : f32
   %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 : tensor<4x4xf32>, vector<4x4xf32>
-  %r = vector.transfer_write %0, %arg1[%c0, %c0] : vector<4x4xf32>, tensor<4x4xf32>
-  return %r: tensor<4x4xf32>
+  %res = vector.transfer_write %0, %arg1[%c0, %c0] : vector<4x4xf32>, tensor<4x4xf32>
+  return %res: tensor<4x4xf32>
 }
 
 // -----
 
 // CHECK-LABEL: func @transfer_read_unroll_permutation
-//       CHECK-DAG:   %[[C4:.*]] = arith.constant 4 : index
-//       CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//       CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//       CHECK:   %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC0:.*]] = vector.insert_strided_slice %[[VTR0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
-//  CHECK-NEXT:   %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC1:.*]] = vector.insert_strided_slice %[[VTR1]], %[[VEC0]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
-//  CHECK-NEXT:   %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C4]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC2:.*]] = vector.insert_strided_slice %[[VTR2]], %[[VEC1]] {offsets = [0, 4], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
-//  CHECK-NEXT:   %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC3:.*]] = vector.insert_strided_slice %[[VTR3]], %[[VEC2]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
-//  CHECK-NEXT:   %[[VTR4:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C2]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC4:.*]] = vector.insert_strided_slice %[[VTR4]], %[[VEC3]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
-//  CHECK-NEXT:   %[[VTR5:.*]] = vector.transfer_read {{.*}}[%[[C4]], %[[C2]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC5:.*]] = vector.insert_strided_slice %[[VTR5]], %[[VEC4]] {offsets = [2, 4], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
-//  CHECK-NEXT:   return %[[VEC5]] : vector<4x6xf32>
+// CHECK-DAG:     %[[C4:.*]] = arith.constant 4 : index
+// CHECK-DAG:     %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// CHECK:         %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC0:.*]] = vector.insert_strided_slice %[[VTR0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
+// CHECK-NEXT:    %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC1:.*]] = vector.insert_strided_slice %[[VTR1]], %[[VEC0]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
+// CHECK-NEXT:    %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C4]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC2:.*]] = vector.insert_strided_slice %[[VTR2]], %[[VEC1]] {offsets = [0, 4], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
+// CHECK-NEXT:    %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC3:.*]] = vector.insert_strided_slice %[[VTR3]], %[[VEC2]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
+// CHECK-NEXT:    %[[VTR4:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C2]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC4:.*]] = vector.insert_strided_slice %[[VTR4]], %[[VEC3]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
+// CHECK-NEXT:    %[[VTR5:.*]] = vector.transfer_read {{.*}}[%[[C4]], %[[C2]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC5:.*]] = vector.insert_strided_slice %[[VTR5]], %[[VEC4]] {offsets = [2, 4], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
+// CHECK-NEXT:    return %[[VEC5]] : vector<4x6xf32>
 #map0 = affine_map<(d0, d1) -> (d1, d0)>
-func.func @transfer_read_unroll_permutation(%arg0 : memref<6x4xf32>) -> vector<4x6xf32> {
+func.func @transfer_read_unroll_permutation(%mem : memref<6x4xf32>) -> vector<4x6xf32> {
   %c0 = arith.constant 0 : index
   %cf0 = arith.constant 0.0 : f32
-  %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 {permutation_map = #map0} : memref<6x4xf32>, vector<4x6xf32>
-  return %0 : vector<4x6xf32>
+  %res = vector.transfer_read %mem[%c0, %c0], %cf0 {permutation_map = #map0} : memref<6x4xf32>, vector<4x6xf32>
+  return %res : vector<4x6xf32>
 }
 
 // -----
 
 // CHECK-LABEL: func @transfer_read_unroll_broadcast
-//       CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//       CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//       CHECK:   %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC0:.*]] = vector.insert_strided_slice %[[VTR0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  CHECK-NEXT:   %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC1:.*]] = vector.insert_strided_slice %[[VTR1]], %[[VEC0]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  CHECK-NEXT:   %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC2:.*]] = vector.insert_strided_slice %[[VTR2]], %[[VEC1]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  CHECK-NEXT:   %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC3:.*]] = vector.insert_strided_slice %[[VTR3]], %[[VEC2]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  CHECK-NEXT:   %[[VTR4:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC4:.*]] = vector.insert_strided_slice %[[VTR4]], %[[VEC3]] {offsets = [4, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  CHECK-NEXT:   %[[VTR5:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC5:.*]] = vector.insert_strided_slice %[[VTR5]], %[[VEC4]] {offsets = [4, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  CHECK-NEXT:   return %[[VEC5]] : vector<6x4xf32>
+// CHECK-DAG:     %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// CHECK:         %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC0:.*]] = vector.insert_strided_slice %[[VTR0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// CHECK-NEXT:    %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC1:.*]] = vector.insert_strided_slice %[[VTR1]], %[[VEC0]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// CHECK-NEXT:    %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC2:.*]] = vector.insert_strided_slice %[[VTR2]], %[[VEC1]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// CHECK-NEXT:    %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC3:.*]] = vector.insert_strided_slice %[[VTR3]], %[[VEC2]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// CHECK-NEXT:    %[[VTR4:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC4:.*]] = vector.insert_strided_slice %[[VTR4]], %[[VEC3]] {offsets = [4, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// CHECK-NEXT:    %[[VTR5:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC5:.*]] = vector.insert_strided_slice %[[VTR5]], %[[VEC4]] {offsets = [4, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// CHECK-NEXT:    return %[[VEC5]] : vector<6x4xf32>
 #map0 = affine_map<(d0, d1) -> (0, d1)>
-func.func @transfer_read_unroll_broadcast(%arg0 : memref<6x4xf32>) -> vector<6x4xf32> {
+func.func @transfer_read_unroll_broadcast(%mem : memref<6x4xf32>) -> vector<6x4xf32> {
   %c0 = arith.constant 0 : index
   %cf0 = arith.constant 0.0 : f32
-  %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 {in_bounds = [true, false], permutation_map = #map0} : memref<6x4xf32>, vector<6x4xf32>
-  return %0 : vector<6x4xf32>
+  %res = vector.transfer_read %mem[%c0, %c0], %cf0 {in_bounds = [true, false], permutation_map = #map0} : memref<6x4xf32>, vector<6x4xf32>
+  return %res : vector<6x4xf32>
 }
 
 // -----
 
 // CHECK-LABEL: func @transfer_read_unroll_broadcast_permuation
-//       CHECK-DAG:   %[[C4:.*]] = arith.constant 4 : index
-//       CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//       CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//       CHECK:   %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC0:.*]] = vector.insert_strided_slice %[[VTR0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
-//  CHECK-NEXT:   %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC1:.*]] = vector.insert_strided_slice %[[VTR1]], %[[VEC0]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
-//  CHECK-NEXT:   %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C4]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC2:.*]] = vector.insert_strided_slice %[[VTR2]], %[[VEC1]] {offsets = [0, 4], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
-//  CHECK-NEXT:   %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC3:.*]] = vector.insert_strided_slice %[[VTR3]], %[[VEC2]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
-//  CHECK-NEXT:   %[[VTR4:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC4:.*]] = vector.insert_strided_slice %[[VTR4]], %[[VEC3]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
-//  CHECK-NEXT:   %[[VTR5:.*]] = vector.transfer_read {{.*}}[%[[C4]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC5:.*]] = vector.insert_strided_slice %[[VTR5]], %[[VEC4]] {offsets = [2, 4], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
-//  CHECK-NEXT:   return %[[VEC5]] : vector<4x6xf32>
+// CHECK-DAG:     %[[C4:.*]] = arith.constant 4 : index
+// CHECK-DAG:     %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// CHECK:         %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC0:.*]] = vector.insert_strided_slice %[[VTR0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
+// CHECK-NEXT:    %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC1:.*]] = vector.insert_strided_slice %[[VTR1]], %[[VEC0]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
+// CHECK-NEXT:    %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C4]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC2:.*]] = vector.insert_strided_slice %[[VTR2]], %[[VEC1]] {offsets = [0, 4], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
+// CHECK-NEXT:    %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC3:.*]] = vector.insert_strided_slice %[[VTR3]], %[[VEC2]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
+// CHECK-NEXT:    %[[VTR4:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC4:.*]] = vector.insert_strided_slice %[[VTR4]], %[[VEC3]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
+// CHECK-NEXT:    %[[VTR5:.*]] = vector.transfer_read {{.*}}[%[[C4]], %[[C0]]], %{{.*}} : memref<6x4xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC5:.*]] = vector.insert_strided_slice %[[VTR5]], %[[VEC4]] {offsets = [2, 4], strides = [1, 1]} : vector<2x2xf32> into vector<4x6xf32>
+// CHECK-NEXT:    return %[[VEC5]] : vector<4x6xf32>
 #map0 = affine_map<(d0, d1) -> (0, d0)>
-func.func @transfer_read_unroll_broadcast_permuation(%arg0 : memref<6x4xf32>) -> vector<4x6xf32> {
+func.func @transfer_read_unroll_broadcast_permuation(%mem : memref<6x4xf32>) -> vector<4x6xf32> {
   %c0 = arith.constant 0 : index
   %cf0 = arith.constant 0.0 : f32
-  %0 = vector.transfer_read %arg0[%c0, %c0], %cf0 {in_bounds = [true, false], permutation_map = #map0} : memref<6x4xf32>, vector<4x6xf32>
-  return %0 : vector<4x6xf32>
+  %res = vector.transfer_read %mem[%c0, %c0], %cf0 {in_bounds = [true, false], permutation_map = #map0} : memref<6x4xf32>, vector<4x6xf32>
+  return %res : vector<4x6xf32>
 }
 
 // -----
 
-// CHECK-LABEL: func @transfer_read_unroll_different_rank
-//       CHECK-DAG:   %[[C4:.*]] = arith.constant 4 : index
-//       CHECK-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//       CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//       CHECK:   %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC0:.*]] = vector.insert_strided_slice %[[VTR0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  CHECK-NEXT:   %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]], %[[C0]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC1:.*]] = vector.insert_strided_slice %[[VTR1]], %[[VEC0]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  CHECK-NEXT:   %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]], %[[C2]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC2:.*]] = vector.insert_strided_slice %[[VTR2]], %[[VEC1]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  CHECK-NEXT:   %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]], %[[C2]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC3:.*]] = vector.insert_strided_slice %[[VTR3]], %[[VEC2]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  CHECK-NEXT:   %[[VTR4:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]], %[[C4]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC4:.*]] = vector.insert_strided_slice %[[VTR4]], %[[VEC3]] {offsets = [4, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  CHECK-NEXT:   %[[VTR5:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]], %[[C4]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC5:.*]] = vector.insert_strided_slice %[[VTR5]], %[[VEC4]] {offsets = [4, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  CHECK-NEXT:   return %[[VEC5]] : vector<6x4xf32>
-
-// ORDER-LABEL: func @transfer_read_unroll_different_rank
-//       ORDER-DAG:   %[[C4:.*]] = arith.constant 4 : index
-//       ORDER-DAG:   %[[C2:.*]] = arith.constant 2 : index
-//       ORDER-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//       ORDER:   %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
-//  ORDER-NEXT:   %[[VEC0:.*]] = vector.insert_strided_slice %[[VTR0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  ORDER-NEXT:   %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]], %[[C2]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
-//  ORDER-NEXT:   %[[VEC1:.*]] = vector.insert_strided_slice %[[VTR1]], %[[VEC0]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  ORDER-NEXT:   %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]], %[[C4]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
-//  ORDER-NEXT:   %[[VEC2:.*]] = vector.insert_strided_slice %[[VTR2]], %[[VEC1]] {offsets = [4, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  ORDER-NEXT:   %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]], %[[C0]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
-//  ORDER-NEXT:   %[[VEC3:.*]] = vector.insert_strided_slice %[[VTR3]], %[[VEC2]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  ORDER-NEXT:   %[[VTR4:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]], %[[C2]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
-//  ORDER-NEXT:   %[[VEC4:.*]] = vector.insert_strided_slice %[[VTR4]], %[[VEC3]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  ORDER-NEXT:   %[[VTR5:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]], %[[C4]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
-//  ORDER-NEXT:   %[[VEC5:.*]] = vector.insert_strided_slice %[[VTR5]], %[[VEC4]] {offsets = [4, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  ORDER-NEXT:   return %[[VEC5]] : vector<6x4xf32>
+// ALL-LABEL:   func @transfer_read_unroll_different_rank
+// CHECK-DAG:     %[[C4:.*]] = arith.constant 4 : index
+// CHECK-DAG:     %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// CHECK:         %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC0:.*]] = vector.insert_strided_slice %[[VTR0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// CHECK-NEXT:    %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]], %[[C0]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC1:.*]] = vector.insert_strided_slice %[[VTR1]], %[[VEC0]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// CHECK-NEXT:    %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]], %[[C2]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC2:.*]] = vector.insert_strided_slice %[[VTR2]], %[[VEC1]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// CHECK-NEXT:    %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]], %[[C2]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC3:.*]] = vector.insert_strided_slice %[[VTR3]], %[[VEC2]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// CHECK-NEXT:    %[[VTR4:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]], %[[C4]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC4:.*]] = vector.insert_strided_slice %[[VTR4]], %[[VEC3]] {offsets = [4, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// CHECK-NEXT:    %[[VTR5:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]], %[[C4]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC5:.*]] = vector.insert_strided_slice %[[VTR5]], %[[VEC4]] {offsets = [4, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// CHECK-NEXT:    return %[[VEC5]] : vector<6x4xf32>
+
+// ORDER-DAG:     %[[C4:.*]] = arith.constant 4 : index
+// ORDER-DAG:     %[[C2:.*]] = arith.constant 2 : index
+// ORDER-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// ORDER:         %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
+// ORDER-NEXT:    %[[VEC0:.*]] = vector.insert_strided_slice %[[VTR0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// ORDER-NEXT:    %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]], %[[C2]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
+// ORDER-NEXT:    %[[VEC1:.*]] = vector.insert_strided_slice %[[VTR1]], %[[VEC0]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// ORDER-NEXT:    %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]], %[[C4]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
+// ORDER-NEXT:    %[[VEC2:.*]] = vector.insert_strided_slice %[[VTR2]], %[[VEC1]] {offsets = [4, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// ORDER-NEXT:    %[[VTR3:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]], %[[C0]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
+// ORDER-NEXT:    %[[VEC3:.*]] = vector.insert_strided_slice %[[VTR3]], %[[VEC2]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// ORDER-NEXT:    %[[VTR4:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]], %[[C2]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
+// ORDER-NEXT:    %[[VEC4:.*]] = vector.insert_strided_slice %[[VTR4]], %[[VEC3]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// ORDER-NEXT:    %[[VTR5:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]], %[[C4]]], %{{.*}} : memref<?x?x?xf32>, vector<2x2xf32>
+// ORDER-NEXT:    %[[VEC5:.*]] = vector.insert_strided_slice %[[VTR5]], %[[VEC4]] {offsets = [4, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// ORDER-NEXT:    return %[[VEC5]] : vector<6x4xf32>
 
 #map0 = affine_map<(d0, d1, d2) -> (d2, d0)>
-func.func @transfer_read_unroll_different_rank(%arg0 : memref<?x?x?xf32>) -> vector<6x4xf32> {
+func.func @transfer_read_unroll_different_rank(%mem : memref<?x?x?xf32>) -> vector<6x4xf32> {
   %c0 = arith.constant 0 : index
   %cf0 = arith.constant 0.0 : f32
-  %0 = vector.transfer_read %arg0[%c0, %c0, %c0], %cf0 {permutation_map = #map0} : memref<?x?x?xf32>, vector<6x4xf32>
-  return %0 : vector<6x4xf32>
+  %res = vector.transfer_read %mem[%c0, %c0, %c0], %cf0 {permutation_map = #map0} : memref<?x?x?xf32>, vector<6x4xf32>
+  return %res : vector<6x4xf32>
 }
 
 // -----
 
-// CHECK-LABEL: func @vector_gather_unroll
-//  CHECK-SAME:           %[[ARG0:.*]]: memref<?x?x?xf32>
-//  CHECK-SAME:           %[[ARG1:.*]]: vector<6x4xindex>
-//  CHECK-SAME:           %[[ARG2:.*]]: vector<6x4xi1>
-//  CHECK-SAME:           %[[ARG3:.*]]: vector<6x4xf32>
-//  CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//       CHECK:   %[[IDX0:.*]] = vector.extract_strided_slice %[[ARG1]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
-//  CHECK-NEXT:   %[[MASK0:.*]] = vector.extract_strided_slice %[[ARG2]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
-//  CHECK-NEXT:   %[[PASS0:.*]] = vector.extract_strided_slice %[[ARG3]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
-//  CHECK-NEXT:   %[[VGT0:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX0]]], %[[MASK0]], %[[PASS0]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC0:.*]] = vector.insert_strided_slice %[[VGT0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  CHECK-NEXT:   %[[IDX1:.*]] = vector.extract_strided_slice %[[ARG1]] {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
-//  CHECK-NEXT:   %[[MASK1:.*]] = vector.extract_strided_slice %[[ARG2]] {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
-//  CHECK-NEXT:   %[[PASS1:.*]] = vector.extract_strided_slice %[[ARG3]] {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
-//  CHECK-NEXT:   %[[VGT1:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX1]]], %[[MASK1]], %[[PASS1]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC1:.*]] = vector.insert_strided_slice %[[VGT1]], %[[VEC0]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  CHECK-NEXT:   %[[IDX2:.*]] = vector.extract_strided_slice %[[ARG1]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
-//  CHECK-NEXT:   %[[MASK2:.*]] = vector.extract_strided_slice %[[ARG2]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
-//  CHECK-NEXT:   %[[PASS2:.*]] = vector.extract_strided_slice %[[ARG3]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
-//  CHECK-NEXT:   %[[VGT2:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX2]]], %[[MASK2]], %[[PASS2]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC2:.*]] = vector.insert_strided_slice %[[VGT2]], %[[VEC1]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  CHECK-NEXT:   %[[IDX3:.*]] = vector.extract_strided_slice %[[ARG1]] {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
-//  CHECK-NEXT:   %[[MASK3:.*]] = vector.extract_strided_slice %[[ARG2]] {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
-//  CHECK-NEXT:   %[[PASS3:.*]] = vector.extract_strided_slice %[[ARG3]] {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
-//  CHECK-NEXT:   %[[VGT3:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX3]]], %[[MASK3]], %[[PASS3]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC3:.*]] = vector.insert_strided_slice %[[VGT3]], %[[VEC2]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  CHECK-NEXT:   %[[IDX4:.*]] = vector.extract_strided_slice %[[ARG1]] {offsets = [4, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
-//  CHECK-NEXT:   %[[MASK4:.*]] = vector.extract_strided_slice %[[ARG2]] {offsets = [4, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
-//  CHECK-NEXT:   %[[PASS4:.*]] = vector.extract_strided_slice %[[ARG3]] {offsets = [4, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
-//  CHECK-NEXT:   %[[VGT4:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX4]]], %[[MASK4]], %[[PASS4]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC4:.*]] = vector.insert_strided_slice %[[VGT4]], %[[VEC3]] {offsets = [4, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  CHECK-NEXT:   %[[IDX5:.*]] = vector.extract_strided_slice %[[ARG1]] {offsets = [4, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
-//  CHECK-NEXT:   %[[MASK5:.*]] = vector.extract_strided_slice %[[ARG2]] {offsets = [4, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
-//  CHECK-NEXT:   %[[PASS5:.*]] = vector.extract_strided_slice %[[ARG3]] {offsets = [4, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
-//  CHECK-NEXT:   %[[VGT5:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX5]]], %[[MASK5]], %[[PASS5]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
-//  CHECK-NEXT:   %[[VEC5:.*]] = vector.insert_strided_slice %[[VGT5]], %[[VEC4]] {offsets = [4, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  CHECK-NEXT:   return %[[VEC5]] : vector<6x4xf32>
-
-// ORDER-LABEL: func @vector_gather_unroll
-//  ORDER-SAME:           %[[ARG0:.*]]: memref<?x?x?xf32>
-//  ORDER-SAME:           %[[ARG1:.*]]: vector<6x4xindex>
-//  ORDER-SAME:           %[[ARG2:.*]]: vector<6x4xi1>
-//  ORDER-SAME:           %[[ARG3:.*]]: vector<6x4xf32>
-//  ORDER-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//       ORDER:   %[[IDX0:.*]] = vector.extract_strided_slice %[[ARG1]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
-//  ORDER-NEXT:   %[[MASK0:.*]] = vector.extract_strided_slice %[[ARG2]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
-//  ORDER-NEXT:   %[[PASS0:.*]] = vector.extract_strided_slice %[[ARG3]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
-//  ORDER-NEXT:   %[[VGT0:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX0]]], %[[MASK0]], %[[PASS0]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
-//  ORDER-NEXT:   %[[VEC0:.*]] = vector.insert_strided_slice %[[VGT0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  ORDER-NEXT:   %[[IDX1:.*]] = vector.extract_strided_slice %[[ARG1]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
-//  ORDER-NEXT:   %[[MASK1:.*]] = vector.extract_strided_slice %[[ARG2]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
-//  ORDER-NEXT:   %[[PASS1:.*]] = vector.extract_strided_slice %[[ARG3]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
-//  ORDER-NEXT:   %[[VGT1:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX1]]], %[[MASK1]], %[[PASS1]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
-//  ORDER-NEXT:   %[[VEC1:.*]] = vector.insert_strided_slice %[[VGT1]], %[[VEC0]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  ORDER-NEXT:   %[[IDX2:.*]] = vector.extract_strided_slice %[[ARG1]] {offsets = [4, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
-//  ORDER-NEXT:   %[[MASK2:.*]] = vector.extract_strided_slice %[[ARG2]] {offsets = [4, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
-//  ORDER-NEXT:   %[[PASS2:.*]] = vector.extract_strided_slice %[[ARG3]] {offsets = [4, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
-//  ORDER-NEXT:   %[[VGT2:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX2]]], %[[MASK2]], %[[PASS2]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
-//  ORDER-NEXT:   %[[VEC2:.*]] = vector.insert_strided_slice %[[VGT2]], %[[VEC1]] {offsets = [4, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  ORDER-NEXT:   %[[IDX3:.*]] = vector.extract_strided_slice %[[ARG1]] {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
-//  ORDER-NEXT:   %[[MASK3:.*]] = vector.extract_strided_slice %[[ARG2]] {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
-//  ORDER-NEXT:   %[[PASS3:.*]] = vector.extract_strided_slice %[[ARG3]] {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
-//  ORDER-NEXT:   %[[VGT3:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX3]]], %[[MASK3]], %[[PASS3]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
-//  ORDER-NEXT:   %[[VEC3:.*]] = vector.insert_strided_slice %[[VGT3]], %[[VEC2]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  ORDER-NEXT:   %[[IDX4:.*]] = vector.extract_strided_slice %[[ARG1]] {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
-//  ORDER-NEXT:   %[[MASK4:.*]] = vector.extract_strided_slice %[[ARG2]] {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
-//  ORDER-NEXT:   %[[PASS4:.*]] = vector.extract_strided_slice %[[ARG3]] {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
-//  ORDER-NEXT:   %[[VGT4:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX4]]], %[[MASK4]], %[[PASS4]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
-//  ORDER-NEXT:   %[[VEC4:.*]] = vector.insert_strided_slice %[[VGT4]], %[[VEC3]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  ORDER-NEXT:   %[[IDX5:.*]] = vector.extract_strided_slice %[[ARG1]] {offsets = [4, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
-//  ORDER-NEXT:   %[[MASK5:.*]] = vector.extract_strided_slice %[[ARG2]] {offsets = [4, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
-//  ORDER-NEXT:   %[[PASS5:.*]] = vector.extract_strided_slice %[[ARG3]] {offsets = [4, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
-//  ORDER-NEXT:   %[[VGT5:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX5]]], %[[MASK5]], %[[PASS5]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
-//  ORDER-NEXT:   %[[VEC5:.*]] = vector.insert_strided_slice %[[VGT5]], %[[VEC4]] {offsets = [4, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
-//  ORDER-NEXT:   return %[[VEC5]] : vector<6x4xf32>
-
-func.func @vector_gather_unroll(%arg0 : memref<?x?x?xf32>,
+// ALL-LABEL:   func @vector_gather_unroll
+// ALL-SAME:      %[[MEM:.*]]: memref<?x?x?xf32>
+// ALL-SAME:      %[[INDICES:.*]]: vector<6x4xindex>
+// ALL-SAME:      %[[MASK:.*]]: vector<6x4xi1>
+// ALL-SAME:      %[[PT:.*]]: vector<6x4xf32>
+
+// CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// CHECK:         %[[IDX0:.*]] = vector.extract_strided_slice %[[INDICES]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
+// CHECK-NEXT:    %[[MASK0:.*]] = vector.extract_strided_slice %[[MASK]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
+// CHECK-NEXT:    %[[PASS0:.*]] = vector.extract_strided_slice %[[PT]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
+// CHECK-NEXT:    %[[VGT0:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX0]]], %[[MASK0]], %[[PASS0]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC0:.*]] = vector.insert_strided_slice %[[VGT0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// CHECK-NEXT:    %[[IDX1:.*]] = vector.extract_strided_slice %[[INDICES]] {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
+// CHECK-NEXT:    %[[MASK1:.*]] = vector.extract_strided_slice %[[MASK]] {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
+// CHECK-NEXT:    %[[PASS1:.*]] = vector.extract_strided_slice %[[PT]] {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
+// CHECK-NEXT:    %[[VGT1:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX1]]], %[[MASK1]], %[[PASS1]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC1:.*]] = vector.insert_strided_slice %[[VGT1]], %[[VEC0]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// CHECK-NEXT:    %[[IDX2:.*]] = vector.extract_strided_slice %[[INDICES]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
+// CHECK-NEXT:    %[[MASK2:.*]] = vector.extract_strided_slice %[[MASK]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
+// CHECK-NEXT:    %[[PASS2:.*]] = vector.extract_strided_slice %[[PT]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
+// CHECK-NEXT:    %[[VGT2:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX2]]], %[[MASK2]], %[[PASS2]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC2:.*]] = vector.insert_strided_slice %[[VGT2]], %[[VEC1]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// CHECK-NEXT:    %[[IDX3:.*]] = vector.extract_strided_slice %[[INDICES]] {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
+// CHECK-NEXT:    %[[MASK3:.*]] = vector.extract_strided_slice %[[MASK]] {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
+// CHECK-NEXT:    %[[PASS3:.*]] = vector.extract_strided_slice %[[PT]] {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
+// CHECK-NEXT:    %[[VGT3:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX3]]], %[[MASK3]], %[[PASS3]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC3:.*]] = vector.insert_strided_slice %[[VGT3]], %[[VEC2]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// CHECK-NEXT:    %[[IDX4:.*]] = vector.extract_strided_slice %[[INDICES]] {offsets = [4, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
+// CHECK-NEXT:    %[[MASK4:.*]] = vector.extract_strided_slice %[[MASK]] {offsets = [4, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
+// CHECK-NEXT:    %[[PASS4:.*]] = vector.extract_strided_slice %[[PT]] {offsets = [4, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
+// CHECK-NEXT:    %[[VGT4:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX4]]], %[[MASK4]], %[[PASS4]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC4:.*]] = vector.insert_strided_slice %[[VGT4]], %[[VEC3]] {offsets = [4, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// CHECK-NEXT:    %[[IDX5:.*]] = vector.extract_strided_slice %[[INDICES]] {offsets = [4, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
+// CHECK-NEXT:    %[[MASK5:.*]] = vector.extract_strided_slice %[[MASK]] {offsets = [4, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
+// CHECK-NEXT:    %[[PASS5:.*]] = vector.extract_strided_slice %[[PT]] {offsets = [4, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
+// CHECK-NEXT:    %[[VGT5:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX5]]], %[[MASK5]], %[[PASS5]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
+// CHECK-NEXT:    %[[VEC5:.*]] = vector.insert_strided_slice %[[VGT5]], %[[VEC4]] {offsets = [4, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// CHECK-NEXT:    return %[[VEC5]] : vector<6x4xf32>
+
+// ORDER-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// ORDER:         %[[IDX0:.*]] = vector.extract_strided_slice %[[INDICES]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
+// ORDER-NEXT:    %[[MASK0:.*]] = vector.extract_strided_slice %[[MASK]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
+// ORDER-NEXT:    %[[PASS0:.*]] = vector.extract_strided_slice %[[PT]] {offsets = [0, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
+// ORDER-NEXT:    %[[VGT0:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX0]]], %[[MASK0]], %[[PASS0]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
+// ORDER-NEXT:    %[[VEC0:.*]] = vector.insert_strided_slice %[[VGT0]], %{{.*}} {offsets = [0, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// ORDER-NEXT:    %[[IDX1:.*]] = vector.extract_strided_slice %[[INDICES]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
+// ORDER-NEXT:    %[[MASK1:.*]] = vector.extract_strided_slice %[[MASK]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
+// ORDER-NEXT:    %[[PASS1:.*]] = vector.extract_strided_slice %[[PT]] {offsets = [2, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
+// ORDER-NEXT:    %[[VGT1:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX1]]], %[[MASK1]], %[[PASS1]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
+// ORDER-NEXT:    %[[VEC1:.*]] = vector.insert_strided_slice %[[VGT1]], %[[VEC0]] {offsets = [2, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// ORDER-NEXT:    %[[IDX2:.*]] = vector.extract_strided_slice %[[INDICES]] {offsets = [4, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
+// ORDER-NEXT:    %[[MASK2:.*]] = vector.extract_strided_slice %[[MASK]] {offsets = [4, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
+// ORDER-NEXT:    %[[PASS2:.*]] = vector.extract_strided_slice %[[PT]] {offsets = [4, 0], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
+// ORDER-NEXT:    %[[VGT2:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX2]]], %[[MASK2]], %[[PASS2]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
+// ORDER-NEXT:    %[[VEC2:.*]] = vector.insert_strided_slice %[[VGT2]], %[[VEC1]] {offsets = [4, 0], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// ORDER-NEXT:    %[[IDX3:.*]] = vector.extract_strided_slice %[[INDICES]] {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
+// ORDER-NEXT:    %[[MASK3:.*]] = vector.extract_strided_slice %[[MASK]] {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
+// ORDER-NEXT:    %[[PASS3:.*]] = vector.extract_strided_slice %[[PT]] {offsets = [0, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
+// ORDER-NEXT:    %[[VGT3:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX3]]], %[[MASK3]], %[[PASS3]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
+// ORDER-NEXT:    %[[VEC3:.*]] = vector.insert_strided_slice %[[VGT3]], %[[VEC2]] {offsets = [0, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// ORDER-NEXT:    %[[IDX4:.*]] = vector.extract_strided_slice %[[INDICES]] {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
+// ORDER-NEXT:    %[[MASK4:.*]] = vector.extract_strided_slice %[[MASK]] {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
+// ORDER-NEXT:    %[[PASS4:.*]] = vector.extract_strided_slice %[[PT]] {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
+// ORDER-NEXT:    %[[VGT4:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX4]]], %[[MASK4]], %[[PASS4]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
+// ORDER-NEXT:    %[[VEC4:.*]] = vector.insert_strided_slice %[[VGT4]], %[[VEC3]] {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// ORDER-NEXT:    %[[IDX5:.*]] = vector.extract_strided_slice %[[INDICES]] {offsets = [4, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xindex> to vector<2x2xindex>
+// ORDER-NEXT:    %[[MASK5:.*]] = vector.extract_strided_slice %[[MASK]] {offsets = [4, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xi1> to vector<2x2xi1>
+// ORDER-NEXT:    %[[PASS5:.*]] = vector.extract_strided_slice %[[PT]] {offsets = [4, 2], sizes = [2, 2], strides = [1, 1]} : vector<6x4xf32> to vector<2x2xf32>
+// ORDER-NEXT:    %[[VGT5:.*]] = vector.gather {{.*}}[%[[C0]], %[[C0]], %[[C0]]] [%[[IDX5]]], %[[MASK5]], %[[PASS5]] : memref<?x?x?xf32>, vector<2x2xindex>, vector<2x2xi1>, vector<2x2xf32> into vector<2x2xf32>
+// ORDER-NEXT:    %[[VEC5:.*]] = vector.insert_strided_slice %[[VGT5]], %[[VEC4]] {offsets = [4, 2], strides = [1, 1]} : vector<2x2xf32> into vector<6x4xf32>
+// ORDER-NEXT:    return %[[VEC5]] : vector<6x4xf32>
+
+func.func @vector_gather_unroll(%mem : memref<?x?x?xf32>,
                                 %indices : vector<6x4xindex>,
                                 %mask : vector<6x4xi1>,
                                 %pass_thru : vector<6x4xf32>) -> vector<6x4xf32> {
   %c0 = arith.constant 0 : index
-  %0 = vector.gather %arg0[%c0, %c0, %c0] [%indices], %mask, %pass_thru : memref<?x?x?xf32>, vector<6x4xindex>, vector<6x4xi1>, vector<6x4xf32> into vector<6x4xf32>
-  return %0 : vector<6x4xf32>
+  %res = vector.gather %mem[%c0, %c0, %c0] [%indices], %mask, %pass_thru : memref<?x?x?xf32>, vector<6x4xindex>, vector<6x4xi1>, vector<6x4xf32> into vector<6x4xf32>
+  return %res : vector<6x4xf32>
 }
diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir
index fbdf725f3ec17b..8453983aa07c33 100644
--- a/mlir/test/Target/LLVMIR/llvmir.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir.mlir
@@ -1295,11 +1295,17 @@ llvm.func @complexintconstant() -> !llvm.struct<(i32, i32)> {
 }
 
 llvm.func @complexintconstantsplat() -> !llvm.array<2 x !llvm.struct<(i32, i32)>> {
-  %1 = llvm.mlir.constant(dense<(0, 1)> : tensor<complex<i32>>) : !llvm.array<2 x !llvm.struct<(i32, i32)>>
+  %1 = llvm.mlir.constant(dense<(0, 1)> : tensor<2xcomplex<i32>>) : !llvm.array<2 x !llvm.struct<(i32, i32)>>
   // CHECK: ret [2 x { i32, i32 }] [{ i32, i32 } { i32 0, i32 1 }, { i32, i32 } { i32 0, i32 1 }]
   llvm.return %1 : !llvm.array<2 x !llvm.struct<(i32, i32)>>
 }
 
+llvm.func @complexintconstantsingle() -> !llvm.array<1 x !llvm.struct<(i32, i32)>> {
+  %1 = llvm.mlir.constant(dense<(0, 1)> : tensor<complex<i32>>) : !llvm.array<1 x !llvm.struct<(i32, i32)>>
+  // CHECK: ret [1 x { i32, i32 }] [{ i32, i32 } { i32 0, i32 1 }]
+  llvm.return %1 : !llvm.array<1 x !llvm.struct<(i32, i32)>>
+}
+
 llvm.func @complexintconstantarray() -> !llvm.array<2 x !llvm.array<2 x !llvm.struct<(i32, i32)>>> {
   %1 = llvm.mlir.constant(dense<[[(0, 1), (2, 3)], [(4, 5), (6, 7)]]> : tensor<2x2xcomplex<i32>>) : !llvm.array<2 x!llvm.array<2 x !llvm.struct<(i32, i32)>>>
   // CHECK{LITERAL}: ret [2 x [2 x { i32, i32 }]] [[2 x { i32, i32 }] [{ i32, i32 } { i32 0, i32 1 }, { i32, i32 } { i32 2, i32 3 }], [2 x { i32, i32 }] [{ i32, i32 } { i32 4, i32 5 }, { i32, i32 } { i32 6, i32 7 }]]
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index 5b22bbaac144fe..4c1f7712249a3a 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -102,8 +102,9 @@ struct KernelArgsTy {
       0; // Tripcount for the teams / distribute loop, 0 otherwise.
   struct {
     uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
-    uint64_t Unused : 63;
-  } Flags = {0, 0};
+    uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
+    uint64_t Unused : 62;
+  } Flags = {0, 0, 0};
   // The number of teams (for x,y,z dimension).
   uint32_t NumTeams[3] = {0, 0, 0};
    // The number of threads (for x,y,z dimension).
diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h
index 323dee41630f2f..2b6445e9fbe550 100644
--- a/offload/include/omptarget.h
+++ b/offload/include/omptarget.h
@@ -107,7 +107,7 @@ enum TargetAllocTy : int32_t {
 
 inline KernelArgsTy CTorDTorKernelArgs = {1,       0,       nullptr,   nullptr,
 	     nullptr, nullptr, nullptr,   nullptr,
-	     0,      {0,0},       {1, 0, 0}, {1, 0, 0}, 0};
+	     0,      {0,0,0},       {1, 0, 0}, {1, 0, 0}, 0};
 
 struct DeviceTy;
 
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index c3ecbcc62f71f1..84d946507ea74a 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -552,9 +552,17 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
   if (!KernelLaunchEnvOrErr)
     return KernelLaunchEnvOrErr.takeError();
 
-  KernelLaunchParamsTy LaunchParams =
-      prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, Args,
-                  Ptrs, *KernelLaunchEnvOrErr);
+  KernelLaunchParamsTy LaunchParams;
+
+  // Kernel languages don't use indirection.
+  if (KernelArgs.Flags.IsCUDA) {
+    LaunchParams =
+        *reinterpret_cast<KernelLaunchParamsTy *>(KernelArgs.ArgPtrs);
+  } else {
+    LaunchParams =
+        prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs,
+                    Args, Ptrs, *KernelLaunchEnvOrErr);
+  }
 
   uint32_t NumThreads = getNumThreads(GenericDevice, KernelArgs.ThreadLimit);
   uint64_t NumBlocks =
diff --git a/offload/src/CMakeLists.txt b/offload/src/CMakeLists.txt
index 344069b6fcdcf6..c5f5d902fad14c 100644
--- a/offload/src/CMakeLists.txt
+++ b/offload/src/CMakeLists.txt
@@ -22,6 +22,7 @@ add_llvm_library(omptarget
   OpenMP/InteropAPI.cpp
   OpenMP/OMPT/Callback.cpp
 
+  KernelLanguage/API.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LIBOMPTARGET_INCLUDE_DIR}
diff --git a/offload/src/KernelLanguage/API.cpp b/offload/src/KernelLanguage/API.cpp
new file mode 100644
index 00000000000000..ef1aad829e7bd7
--- /dev/null
+++ b/offload/src/KernelLanguage/API.cpp
@@ -0,0 +1,73 @@
+//===------ API.cpp - Kernel Language (CUDA/HIP) entry points ----- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Call configuration hooks and a launch wrapper around __tgt_target_kernel.
+//===----------------------------------------------------------------------===//
+
+#include "Shared/APITypes.h"
+
+#include <cstdio>
+
+struct dim3 { // Three unsigned components; each one defaults to 0.
+  unsigned x = 0, y = 0, z = 0;
+};
+
+struct __omp_kernel_t { // Launch configuration saved by push, read by pop.
+  dim3 __grid_size;
+  dim3 __block_size;
+  size_t __shared_memory;
+  // Opaque stream handle; this file only stores it and copies it back out.
+  void *__stream;
+};
+
+static __omp_kernel_t __current_kernel = {}; // Last pushed launch config.
+#pragma omp threadprivate(__current_kernel)
+
+extern "C" {
+
+// TODO: There is little reason to keep these names or the way the calls are
+// issued; for now we keep both to avoid modifying Clang's CUDA codegen. It is
+// unclear when we actually need to push/pop configurations.
+unsigned __llvmPushCallConfiguration(dim3 __grid_size, dim3 __block_size,
+                                     size_t __shared_memory, void *__stream) {
+  // Stash the configuration; __llvmPopCallConfiguration hands it back.
+  __current_kernel.__grid_size = __grid_size;
+  __current_kernel.__block_size = __block_size;
+  __current_kernel.__shared_memory = __shared_memory;
+  __current_kernel.__stream = __stream;
+  return 0;
+}
+
+unsigned __llvmPopCallConfiguration(dim3 *__grid_size, dim3 *__block_size,
+                                    size_t *__shared_memory, void *__stream) {
+  // Copy the stashed configuration out; __stream is written as a void **.
+  *__grid_size = __current_kernel.__grid_size;
+  *__block_size = __current_kernel.__block_size;
+  *__shared_memory = __current_kernel.__shared_memory;
+  *static_cast<void **>(__stream) = __current_kernel.__stream;
+  return 0;
+}
+
+int __tgt_target_kernel(void *Loc, int64_t DeviceId, int32_t NumTeams,
+                        int32_t ThreadLimit, const void *HostPtr,
+                        KernelArgsTy *Args);
+
+unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
+                          void *args, size_t sharedMem, void *stream) {
+  KernelArgsTy Args = {}; // NOTE(review): `stream` is never read below.
+  Args.DynCGroupMem = sharedMem;
+  Args.NumTeams[0] = gridDim.x;
+  Args.NumTeams[1] = gridDim.y;
+  Args.NumTeams[2] = gridDim.z;
+  Args.ThreadLimit[0] = blockDim.x;
+  Args.ThreadLimit[1] = blockDim.y;
+  Args.ThreadLimit[2] = blockDim.z;
+  Args.ArgPtrs = reinterpret_cast<void **>(args); // Forwarded unmodified.
+  Args.Flags.IsCUDA = true; // Read by GenericKernelTy::launch.
+  return __tgt_target_kernel(nullptr, 0, gridDim.x, blockDim.x, func, &Args);
+}
+}
diff --git a/offload/src/exports b/offload/src/exports
index f95544ec8329c8..7bdc7d2a531bb3 100644
--- a/offload/src/exports
+++ b/offload/src/exports
@@ -71,6 +71,9 @@ VERS1.0 {
     __tgt_interop_use;
     __tgt_interop_destroy;
     ompt_libomptarget_connect;
+    __llvmPushCallConfiguration;
+    __llvmPopCallConfiguration;
+    llvmLaunchKernel;
   local:
     *;
 };
diff --git a/offload/test/api/omp_dynamic_shared_memory_amdgpu.c b/offload/test/api/omp_dynamic_shared_memory_amdgpu.c
index 0b4d9d6ea9d46e..1aaec06659a7b5 100644
--- a/offload/test/api/omp_dynamic_shared_memory_amdgpu.c
+++ b/offload/test/api/omp_dynamic_shared_memory_amdgpu.c
@@ -1,4 +1,5 @@
-// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O1 -mllvm -openmp-opt-inline-device
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O2 -mllvm \
+// RUN:   -openmp-opt-inline-device
 // RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
 // RUN:   %libomptarget-run-amdgcn-amd-amdhsa | %fcheck-amdgcn-amd-amdhsa
 // REQUIRES: amdgcn-amd-amdhsa
diff --git a/offload/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c b/offload/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c
index 656c3a20aaf82a..82f4b2b28a384f 100644
--- a/offload/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c
+++ b/offload/test/api/omp_dynamic_shared_memory_mixed_amdgpu.c
@@ -1,4 +1,5 @@
-// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O1 -mllvm -openmp-opt-inline-device -I %S
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O2 -mllvm \
+// RUN:   -openmp-opt-inline-device -I %S
 // RUN: env LIBOMPTARGET_NEXTGEN_PLUGINS=1 \
 // RUN:   %libomptarget-run-amdgcn-amd-amdhsa | %fcheck-amdgcn-amd-amdhsa
 // REQUIRES: amdgcn-amd-amdhsa
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index 6d6c77db80e16c..b4fc7d3b333b35 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -66,7 +66,7 @@ def evaluate_bool_env(env):
 config.name = 'libomptarget :: ' + config.libomptarget_current_target
 
 # suffixes: A list of file extensions to treat as test files.
-config.suffixes = ['.c', '.cpp', '.cc', '.f90']
+config.suffixes = ['.c', '.cpp', '.cc', '.f90', '.cu']
 
 # excludes: A list of directories to exclude from the testuites.
 config.excludes = ['Inputs']
diff --git a/offload/test/offloading/CUDA/basic_launch.cu b/offload/test/offloading/CUDA/basic_launch.cu
new file mode 100644
index 00000000000000..79f01f48b6c2ad
--- /dev/null
+++ b/offload/test/offloading/CUDA/basic_launch.cu
@@ -0,0 +1,32 @@
+// clang-format off
+// RUN: %clang++ %flags -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+// RUN: %clang++ %flags -foffload-via-llvm --offload-arch=native %s -o %t -fopenmp
+// RUN: %t | %fcheck-generic
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+__global__ void square(int *A) { *A = 42; } // Stores 42; no squaring.
+
+int main(int argc, char **argv) {
+  int DevNo = 0; // Device number passed to the alloc/free calls below.
+  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+  *Ptr = 7; // Initial value written from the host side.
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
+  square<<<1, 1>>>(Ptr); // One block, one thread; the kernel stores 42.
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr]], *Ptr: 42
+  llvm_omp_target_free_shared(Ptr, DevNo);
+}
diff --git a/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
new file mode 100644
index 00000000000000..d4a6bc9ddfb3fa
--- /dev/null
+++ b/offload/test/offloading/CUDA/basic_launch_blocks_and_threads.cu
@@ -0,0 +1,34 @@
+// clang-format off
+// RUN: %clang++ %flags -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+// RUN: %clang++ %flags -foffload-via-llvm --offload-arch=native %s -o %t -fopenmp
+// RUN: %t | %fcheck-generic
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+__global__ void square(int *A) { // Each launched thread adds 1 to *A.
+  __scoped_atomic_fetch_add(A, 1, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE);
+}
+
+int main(int argc, char **argv) {
+  int DevNo = 0; // Device number passed to the alloc/free calls below.
+  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+  *Ptr = 0; // Counter starts at zero.
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 0
+  square<<<7, 6>>>(Ptr); // 7 blocks x 6 threads = 42 atomic increments.
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr]], *Ptr: 42
+  llvm_omp_target_free_shared(Ptr, DevNo);
+}
diff --git a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
new file mode 100644
index 00000000000000..c11c194b5e0611
--- /dev/null
+++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
@@ -0,0 +1,48 @@
+// clang-format off
+// RUN: %clang++ %flags -foffload-via-llvm --offload-arch=native %s -o %t
+// RUN: %t | %fcheck-generic
+// RUN: %clang++ %flags -foffload-via-llvm --offload-arch=native %s -o %t -fopenmp
+// RUN: %t | %fcheck-generic
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+// Writes (Src[0] + Src[1]) * (Q + P) to *Dst, then overwrites Src[0] and
+// Src[1] with Q and P.
+__global__ void square(int *Dst, short Q, int *Src, short P) {
+  *Dst = (Src[0] + Src[1]) * (Q + P);
+  Src[0] = Q;
+  Src[1] = P;
+}
+
+int main(int argc, char **argv) {
+  int DevNo = 0;
+  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+  int *Src = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(8, DevNo));
+  *Ptr = 7;
+  Src[0] = -2;
+  Src[1] = 8;
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
+  printf("Src: %i : %i\n", Src[0], Src[1]);
+  // CHECK: Src: -2 : 8
+  // (-2 + 8) * (3 + 4) == 42.
+  square<<<1, 1>>>(Ptr, 3, Src, 4);
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr]], *Ptr: 42
+  printf("Src: %i : %i\n", Src[0], Src[1]);
+  // CHECK: Src: 3 : 4
+  llvm_omp_target_free_shared(Ptr, DevNo);
+  // Release the second shared allocation as well.
+  llvm_omp_target_free_shared(Src, DevNo);
+}
diff --git a/offload/test/offloading/CUDA/kernel_tu.cu.inc b/offload/test/offloading/CUDA/kernel_tu.cu.inc
new file mode 100644
index 00000000000000..d7d28a109dfc5a
--- /dev/null
+++ b/offload/test/offloading/CUDA/kernel_tu.cu.inc
@@ -0,0 +1 @@
+__global__ void square(int *A) { *A = 42; } // Stores 42; no squaring.
diff --git a/offload/test/offloading/CUDA/launch_tu.cu b/offload/test/offloading/CUDA/launch_tu.cu
new file mode 100644
index 00000000000000..aad3d509752376
--- /dev/null
+++ b/offload/test/offloading/CUDA/launch_tu.cu
@@ -0,0 +1,32 @@
+// clang-format off
+// RUN: %clang++ %flags -foffload-via-llvm --offload-arch=native %s -o %t.launch_tu.o -c
+// RUN: %clang++ %flags -foffload-via-llvm --offload-arch=native -x cuda %S/kernel_tu.cu.inc -o %t.kernel_tu.o -c
+// RUN: %clang++ %flags -foffload-via-llvm --offload-arch=native %t.launch_tu.o %t.kernel_tu.o -o %t
+// RUN: %t | %fcheck-generic
+// clang-format on
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+#include <stdio.h>
+
+extern "C" {
+void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
+void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum);
+}
+
+extern __global__ void square(int *A); // Defined in kernel_tu.cu.inc.
+
+int main(int argc, char **argv) {
+  int DevNo = 0; // Device number passed to the alloc/free calls below.
+  int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo));
+  *Ptr = 7; // Initial value written from the host side.
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr:0x.*]], *Ptr: 7
+  square<<<1, 1>>>(Ptr); // One block, one thread; kernel is in the other TU.
+  printf("Ptr %p, *Ptr: %i\n", Ptr, *Ptr);
+  // CHECK: Ptr [[Ptr]], *Ptr: 42
+  llvm_omp_target_free_shared(Ptr, DevNo);
+}
diff --git a/offload/test/offloading/bug51781.c b/offload/test/offloading/bug51781.c
index 35ecf55aa8c534..17b7499a7606e4 100644
--- a/offload/test/offloading/bug51781.c
+++ b/offload/test/offloading/bug51781.c
@@ -5,7 +5,7 @@
 
 // SPMDize.  There is no main thread, so there's no issue.
 //
-// RUN: %libomptarget-compile-generic -O1 -Rpass=openmp-opt > %t.spmd 2>&1
+// RUN: %libomptarget-compile-generic -O2 -Rpass=openmp-opt > %t.spmd 2>&1
 // RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=SPMD -input-file=%t.spmd
 // RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=SPMD -input-file=%t.spmd
 // RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
@@ -15,7 +15,7 @@
 // Use the custom state machine, which must avoid the same barrier problem as
 // the generic state machine.
 //
-// RUN: %libomptarget-compile-generic -O1 -Rpass=openmp-opt \
+// RUN: %libomptarget-compile-generic -O2 -Rpass=openmp-opt \
 // RUN:   -mllvm -openmp-opt-disable-spmdization > %t.custom 2>&1
 // RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=CUSTOM -input-file=%t.custom
 // RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=CUSTOM -input-file=%t.custom
@@ -24,7 +24,7 @@
 // Repeat with reduction clause, which has managed to break the custom state
 // machine in the past.
 //
-// RUN: %libomptarget-compile-generic -O1 -Rpass=openmp-opt -DADD_REDUCTION \
+// RUN: %libomptarget-compile-generic -O2 -Rpass=openmp-opt -DADD_REDUCTION \
 // RUN:   -mllvm -openmp-opt-disable-spmdization > %t.custom 2>&1
 // RUN: %fcheck-nvptx64-nvidia-cuda -check-prefix=CUSTOM -input-file=%t.custom
 // RUN: %fcheck-amdgcn-amd-amdhsa -check-prefix=CUSTOM -input-file=%t.custom
diff --git a/offload/test/offloading/bug51982.c b/offload/test/offloading/bug51982.c
index 91ce4a264e2382..b19707aacde983 100644
--- a/offload/test/offloading/bug51982.c
+++ b/offload/test/offloading/bug51982.c
@@ -1,6 +1,6 @@
-// RUN: %libomptarget-compile-generic -O1 && %libomptarget-run-generic
-// -O1 to run openmp-opt
-// RUN: %libomptarget-compileopt-generic -O1 && %libomptarget-run-generic
+// RUN: %libomptarget-compile-generic -O2 && %libomptarget-run-generic
+// -O2 to run openmp-opt
+// RUN: %libomptarget-compileopt-generic -O2 && %libomptarget-run-generic
 
 int main(void) {
   long int aa = 0;
diff --git a/utils/bazel/.bazelrc b/utils/bazel/.bazelrc
index 975667f8fa8118..67fb4f432787f8 100644
--- a/utils/bazel/.bazelrc
+++ b/utils/bazel/.bazelrc
@@ -6,6 +6,11 @@
 # Common flags that apply to all configurations.
 # Use sparingly for things common to all compilers and platforms.
 ###############################################################################
+
+# Keep Bzlmod (MODULE.bazel) disabled until we're ready to adopt it.
+# https://github.com/llvm/llvm-project/issues/55924
+common --enable_bzlmod=false
+
 # Prevent invalid caching if input files are modified during a build.
 build --experimental_guard_against_concurrent_changes
 
diff --git a/utils/bazel/.bazelversion b/utils/bazel/.bazelversion
index 5e3254243a3b27..1502020768a7b4 100644
--- a/utils/bazel/.bazelversion
+++ b/utils/bazel/.bazelversion
@@ -1 +1 @@
-6.1.2
+7.3.0