diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 6a69d02fa81753..b80bc30cfa0a21 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -11150,7 +11150,7 @@ static bool getAArch64PBV(QualType QT, ASTContext &C) { /// as defined by `LS(P)` in 3.2.1 of the AAVFABI. /// TODO: Add support for references, section 3.2.1, item 1. static unsigned getAArch64LS(QualType QT, ParamKindTy Kind, ASTContext &C) { - if (getAArch64MTV(QT, Kind) && QT.getCanonicalType()->isPointerType()) { + if (!getAArch64MTV(QT, Kind) && QT.getCanonicalType()->isPointerType()) { QualType PTy = QT.getCanonicalType()->getPointeeType(); if (getAArch64PBV(PTy, C)) return C.getTypeSize(PTy); diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp index bad796bf92dcfb..3c91a04d54642f 100644 --- a/clang/lib/CodeGen/CodeGenPGO.cpp +++ b/clang/lib/CodeGen/CodeGenPGO.cpp @@ -1051,8 +1051,7 @@ llvm::MDNode *CodeGenFunction::createProfileWeightsForLoop(const Stmt *Cond, if (!PGO.haveRegionCounts()) return nullptr; Optional CondCount = PGO.getStmtCount(Cond); - assert(CondCount.hasValue() && "missing expected loop condition count"); - if (*CondCount == 0) + if (!CondCount || *CondCount == 0) return nullptr; return createProfileWeights(LoopCount, std::max(*CondCount, LoopCount) - LoopCount); diff --git a/clang/test/Lexer/case-insensitive-include-ms.c b/clang/test/Lexer/case-insensitive-include-ms.c index cf14d2530d0161..f7af1fef8b4e6a 100644 --- a/clang/test/Lexer/case-insensitive-include-ms.c +++ b/clang/test/Lexer/case-insensitive-include-ms.c @@ -6,15 +6,17 @@ // RUN: %clang_cc1 -fsyntax-only -fms-compatibility %s -include %s -I %t/Output -verify // RUN: %clang_cc1 -fsyntax-only -fms-compatibility -fdiagnostics-parseable-fixits %s -include %s -I %t/Output 2>&1 | FileCheck %s -// FIXME: Add a test with repeated backslashes once clang can handle that -// in ms-compat mode on non-Windows hosts. 
#include "..\Output\.\case-insensitive-include.h" #include "..\Output\.\Case-Insensitive-Include.h" // expected-warning {{non-portable path}} // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:50}:"\"..\\Output\\.\\case-insensitive-include.h\"" +#include "..\\Output\.\\Case-Insensitive-Include.h" // expected-warning {{non-portable path}} +// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:52}:"\"..\\\\Output\\.\\\\case-insensitive-include.h\"" #include "..\output\.\case-insensitive-include.h" // expected-warning {{non-portable path}} // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:50}:"\"..\\Output\\.\\case-insensitive-include.h\"" #include "apath\..\.\case-insensitive-include.h" #include "apath\..\.\Case-Insensitive-Include.h" // expected-warning {{non-portable path}} // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:49}:"\"apath\\..\\.\\case-insensitive-include.h\"" +#include "apath\\..\\.\\Case-Insensitive-Include.h" // expected-warning {{non-portable path}} +// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:52}:"\"apath\\\\..\\\\.\\\\case-insensitive-include.h\"" #include "APath\..\.\case-insensitive-include.h" // For the sake of efficiency, this case is not diagnosed. :-( diff --git a/clang/test/OpenMP/aarch64_vfabi_NarrowestDataSize.c b/clang/test/OpenMP/aarch64_vfabi_NarrowestDataSize.c new file mode 100644 index 00000000000000..d65c4edaeea709 --- /dev/null +++ b/clang/test/OpenMP/aarch64_vfabi_NarrowestDataSize.c @@ -0,0 +1,82 @@ +// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -fopenmp -x c -emit-llvm %s -o - -femit-all-decls | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -fopenmp-simd -x c -emit-llvm %s -o - -femit-all-decls | FileCheck %s + +// REQUIRES: aarch64-registered-target +// Note: -fopemp and -fopenmp-simd behavior are expected to be the same. 
+ +// This test checks the values of Narrowest Data Size (NDS), as defined in +// https://github.com/ARM-software/abi-aa/tree/master/vfabia64 +// +// NDS is used to compute the token in the name of AdvSIMD +// vector functions when no `simdlen` is specified, with the rule: +// +// if NDS(f) = 1, then VLEN = 16, 8; +// if NDS(f) = 2, then VLEN = 8, 4; +// if NDS(f) = 4, then VLEN = 4, 2; +// if NDS(f) = 8 or NDS(f) = 16, then VLEN = 2. + +// NDS(NDS_is_sizeof_char) = 1 +#pragma omp declare simd notinbranch +char NDS_is_sizeof_char(short in); +// CHECK-DAG: _ZGVnN16v_NDS_is_sizeof_char +// CHECK-DAG: _ZGVnN8v_NDS_is_sizeof_char +// CHECK-NOT: _ZGV{{.*}}_NDS_is_sizeof_char + +// NDS(NDS_is_sizeof_short) = 2 +#pragma omp declare simd notinbranch +int NDS_is_sizeof_short(short in); +// CHECK-DAG: _ZGVnN8v_NDS_is_sizeof_short +// CHECK-DAG: _ZGVnN4v_NDS_is_sizeof_short +// CHECK-NOT: _ZGV{{.*}}_NDS_is_sizeof_short + +// NDS(NDS_is_sizeof_float_with_linear) = 4, and not 2, because the pointers are +// marked as `linear` and therefore the size of the pointee realizes +// the NDS. +#pragma omp declare simd linear(sin) notinbranch +void NDS_is_sizeof_float_with_linear(double in, float *sin); +// Neon accepts only power of 2 values as . 
+// CHECK-DAG: _ZGVnN4vl4_NDS_is_sizeof_float_with_linear +// CHECK-DAG: _ZGVnN2vl4_NDS_is_sizeof_float_with_linear +// CHECK-NOT: _ZGV{{.*}}_NDS_is_sizeof_float_with_linear + +// NDS(NDS_is_size_of_float) = 4 +#pragma omp declare simd notinbranch +double NDS_is_size_of_float(float in); +// CHECK-DAG: _ZGVnN4v_NDS_is_size_of_float +// CHECK-DAG: _ZGVnN2v_NDS_is_size_of_float +// CHECK-NOT: _ZGV{{.*}}_NDS_is_size_of_float + +// NDS(NDS_is_sizeof_double) = 8 +#pragma omp declare simd linear(sin) notinbranch +void NDS_is_sizeof_double(double in, double *sin); +// CHECK-DAG: _ZGVnN2vl8_NDS_is_sizeof_double +// CHECK-NOT: _ZGV{{.*}}_NDS_is_sizeof_double + +// NDS(double_complex) = 16 +#pragma omp declare simd notinbranch +double _Complex double_complex(double _Complex); +// CHECK-DAG: _ZGVnN2v_double_complex +// CHECK-NOT: _ZGV{{.*}}_double_complex + +// NDS(double_complex_linear_char) = 1, becasue `x` is marked linear. +#pragma omp declare simd linear(x) notinbranch +double _Complex double_complex_linear_char(double _Complex y, char *x); +// CHECK-DAG: _ZGVnN8vl_double_complex_linear_char +// CHECK-DAG: _ZGVnN16vl_double_complex_linear_char +// CHECK-NOT: _ZGV{{.*}}_double_complex_linear_char + +static float *F; +static double *D; +static short S; +static int I; +static char C; +static double _Complex DC; +void do_something() { + C = NDS_is_sizeof_char(S); + I = NDS_is_sizeof_short(S); + NDS_is_sizeof_float_with_linear(*D, F); + *D = NDS_is_size_of_float(*F); + NDS_is_sizeof_double(*D, D); + DC = double_complex(DC); + DC = double_complex_linear_char(DC, &C); +} diff --git a/clang/test/OpenMP/aarch64_vfabi_WidestDataSize.c b/clang/test/OpenMP/aarch64_vfabi_WidestDataSize.c new file mode 100644 index 00000000000000..841a64053e5e3b --- /dev/null +++ b/clang/test/OpenMP/aarch64_vfabi_WidestDataSize.c @@ -0,0 +1,78 @@ +// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +sve -fopenmp -x c -emit-llvm %s -o - -femit-all-decls | FileCheck %s +// RUN: %clang_cc1 
-triple aarch64-linux-gnu -target-feature +sve -fopenmp-simd -x c -emit-llvm %s -o - -femit-all-decls | FileCheck %s + +// REQUIRES: aarch64-registered-target +// Note: -fopemp and -fopenmp-simd behavior are expected to be the same. + +// This test checks the values of Widest Data Size (WDS), as defined +// in https://github.com/ARM-software/abi-aa/tree/master/vfabia64 +// +// WDS is used to check the accepted values of `simdlen()` when +// targeting fixed-length SVE vector function names. The values of +// `` that are accepted are such that for X = WDS * * 8, +// 128-bit <= X <= 2048-bit and X is a multiple of 128-bit. + +#pragma omp declare simd simdlen(8) +#pragma omp declare simd simdlen(16) +#pragma omp declare simd simdlen(256) +#pragma omp declare simd simdlen(272) +char WDS_is_sizeof_char(char in); +// WDS = 1, simdlen(8) and simdlen(272) are not generated. +// CHECK-DAG: _ZGVsM16v_WDS_is_sizeof_char +// CHECK-DAG: _ZGVsM256v_WDS_is_sizeof_char +// CHECK-NOT: _ZGV{{.*}}_WDS_is_sizeof_char + +#pragma omp declare simd simdlen(4) +#pragma omp declare simd simdlen(8) +#pragma omp declare simd simdlen(128) +#pragma omp declare simd simdlen(136) +char WDS_is_sizeof_short(short in); +// WDS = 2, simdlen(4) and simdlen(136) are not generated. +// CHECK-DAG: _ZGVsM8v_WDS_is_sizeof_short +// CHECK-DAG: _ZGVsM128v_WDS_is_sizeof_short +// CHECK-NOT: _ZGV{{.*}}_WDS_is_sizeof_short + +#pragma omp declare simd linear(sin) notinbranch simdlen(2) +#pragma omp declare simd linear(sin) notinbranch simdlen(4) +#pragma omp declare simd linear(sin) notinbranch simdlen(64) +#pragma omp declare simd linear(sin) notinbranch simdlen(68) +void WDS_is_sizeof_float_pointee(float in, float *sin); +// WDS = 4, simdlen(2) and simdlen(68) are not generated. 
+// CHECK-DAG: _ZGVsM4vl4_WDS_is_sizeof_float_pointee +// CHECK-DAG: _ZGVsM64vl4_WDS_is_sizeof_float_pointee +// CHECK-NOT: _ZGV{{.*}}_WDS_is_sizeof_float_pointee + +#pragma omp declare simd linear(sin) notinbranch simdlen(2) +#pragma omp declare simd linear(sin) notinbranch simdlen(4) +#pragma omp declare simd linear(sin) notinbranch simdlen(32) +#pragma omp declare simd linear(sin) notinbranch simdlen(34) +void WDS_is_sizeof_double_pointee(float in, double *sin); +// WDS = 8 because of the linear clause, simdlen(34) is not generated. +// CHECK-DAG: _ZGVsM2vl8_WDS_is_sizeof_double_pointee +// CHECK-DAG: _ZGVsM4vl8_WDS_is_sizeof_double_pointee +// CHECK-DAG: _ZGVsM32vl8_WDS_is_sizeof_double_pointee +// CHECK-NOT: _ZGV{{.*}}_WDS_is_sizeof_double_pointee + +#pragma omp declare simd simdlen(2) +#pragma omp declare simd simdlen(4) +#pragma omp declare simd simdlen(32) +#pragma omp declare simd simdlen(34) +double WDS_is_sizeof_double(double in); +// WDS = 8, simdlen(34) is not generated. +// CHECK-DAG: _ZGVsM2v_WDS_is_sizeof_double +// CHECK-DAG: _ZGVsM4v_WDS_is_sizeof_double +// CHECK-DAG: _ZGVsM32v_WDS_is_sizeof_double +// CHECK-NOT: _ZGV{{.*}}_WDS_is_sizeof_double + +static char C; +static short S; +static float F; +static double D; + +void do_something() { + C = WDS_is_sizeof_char(C); + C = WDS_is_sizeof_short(S); + WDS_is_sizeof_float_pointee(F, &F); + WDS_is_sizeof_double_pointee(F, &D); + D = WDS_is_sizeof_double(D); +} diff --git a/flang/include/flang/Evaluate/check-expression.h b/flang/include/flang/Evaluate/check-expression.h index a26f83b01bbbf4..b14a47838e3aaf 100644 --- a/flang/include/flang/Evaluate/check-expression.h +++ b/flang/include/flang/Evaluate/check-expression.h @@ -12,6 +12,7 @@ #define FORTRAN_EVALUATE_CHECK_EXPRESSION_H_ #include "expression.h" +#include "intrinsics.h" #include "type.h" #include @@ -41,24 +42,38 @@ bool IsInitialDataTarget( // Check whether an expression is a specification expression // (10.1.11(2), C1010). 
Constant expressions are always valid // specification expressions. + +// There are two contexts where specification expressions appear -- array +// bounds and type param expressions. We need to differentiate them because +// additional checks are required for array bounds expressions in declarations +// of derived type components (see C750). +ENUM_CLASS(SpecificationExprContext, TYPE_PARAM, BOUND) + template -void CheckSpecificationExpr( - const A &, parser::ContextualMessages &, const semantics::Scope &); +void CheckSpecificationExpr(const A &, parser::ContextualMessages &, + const semantics::Scope &, const IntrinsicProcTable &, + SpecificationExprContext); extern template void CheckSpecificationExpr(const Expr &x, - parser::ContextualMessages &, const semantics::Scope &); + parser::ContextualMessages &, const semantics::Scope &, + const IntrinsicProcTable &, SpecificationExprContext); extern template void CheckSpecificationExpr(const Expr &x, - parser::ContextualMessages &, const semantics::Scope &); + parser::ContextualMessages &, const semantics::Scope &, + const IntrinsicProcTable &, SpecificationExprContext); extern template void CheckSpecificationExpr(const Expr &x, - parser::ContextualMessages &, const semantics::Scope &); + parser::ContextualMessages &, const semantics::Scope &, + const IntrinsicProcTable &, SpecificationExprContext); extern template void CheckSpecificationExpr( const std::optional> &x, parser::ContextualMessages &, - const semantics::Scope &); + const semantics::Scope &, const IntrinsicProcTable &, + SpecificationExprContext); extern template void CheckSpecificationExpr( const std::optional> &x, parser::ContextualMessages &, - const semantics::Scope &); + const semantics::Scope &, const IntrinsicProcTable &, + SpecificationExprContext); extern template void CheckSpecificationExpr( const std::optional> &x, - parser::ContextualMessages &, const semantics::Scope &); + parser::ContextualMessages &, const semantics::Scope &, + const 
IntrinsicProcTable &, SpecificationExprContext); // Simple contiguity (9.5.4) template diff --git a/flang/include/flang/Evaluate/intrinsics.h b/flang/include/flang/Evaluate/intrinsics.h index fc79638189193e..88d6a7af13eb75 100644 --- a/flang/include/flang/Evaluate/intrinsics.h +++ b/flang/include/flang/Evaluate/intrinsics.h @@ -55,6 +55,11 @@ struct SpecificIntrinsicFunctionInterface : public characteristics::Procedure { // All argument and result types are intrinsic types with default kinds. }; +// Generic intrinsic classes from table 16.1 +ENUM_CLASS(IntrinsicClass, atomicSubroutine, collectiveSubroutine, + elementalFunction, elementalSubroutine, inquiryFunction, pureSubroutine, + impureSubroutine, transformationalFunction, noClass) + class IntrinsicProcTable { private: class Implementation; @@ -68,6 +73,9 @@ class IntrinsicProcTable { // statement. bool IsIntrinsic(const std::string &) const; + // Inquiry intrinsics are defined in section 16.7, table 16.1 + IntrinsicClass GetIntrinsicClass(const std::string &) const; + // Probe the intrinsics for a match against a specific call. 
// On success, the actual arguments are transferred to the result // in dummy argument order; on failure, the actual arguments remain diff --git a/flang/lib/Evaluate/check-expression.cpp b/flang/lib/Evaluate/check-expression.cpp index 3f71cb6a1aeaf8..43686815ab3513 100644 --- a/flang/lib/Evaluate/check-expression.cpp +++ b/flang/lib/Evaluate/check-expression.cpp @@ -7,10 +7,13 @@ //===----------------------------------------------------------------------===// #include "flang/Evaluate/check-expression.h" +#include "flang/Evaluate/intrinsics.h" #include "flang/Evaluate/traverse.h" #include "flang/Evaluate/type.h" #include "flang/Semantics/symbol.h" #include "flang/Semantics/tools.h" +#include +#include namespace Fortran::evaluate { @@ -171,6 +174,7 @@ class IsInitialDataTargetHelper return (*this)(x.left()); } bool operator()(const Relational &) const { return false; } + private: parser::ContextualMessages *messages_; }; @@ -187,8 +191,10 @@ class CheckSpecificationExprHelper public: using Result = std::optional; using Base = AnyTraverse; - explicit CheckSpecificationExprHelper(const semantics::Scope &s) - : Base{*this}, scope_{s} {} + explicit CheckSpecificationExprHelper(const semantics::Scope &s, + const IntrinsicProcTable &table, SpecificationExprContext specExprContext) + : Base{*this}, scope_{s}, table_{table}, specExprContext_{ + specExprContext} {} using Base::operator(); Result operator()(const ProcedureDesignator &) const { @@ -199,6 +205,10 @@ class CheckSpecificationExprHelper Result operator()(const semantics::Symbol &symbol) const { if (semantics::IsNamedConstant(symbol)) { return std::nullopt; + } else if (scope_.IsDerivedType() && IsVariableName(symbol) && + specExprContext_ == SpecificationExprContext::BOUND) { // C750 + return "reference to variable '"s + symbol.name().ToString() + + "' not allowed for derived type components"; } else if (symbol.IsDummy()) { if (symbol.attrs().test(semantics::Attr::OPTIONAL)) { return "reference to OPTIONAL dummy 
argument '"s + @@ -243,16 +253,51 @@ class CheckSpecificationExprHelper return std::nullopt; } + template + Result operator()(const TypeParamInquiry &inq) const { + if (scope_.IsDerivedType() && !IsConstantExpr(inq) && + inq.parameter().owner() != scope_ && + specExprContext_ == SpecificationExprContext::BOUND) { // C750 + return "non-constant reference to a type parameter inquiry " + "not allowed for derived type components"; + } + return std::nullopt; + } + template Result operator()(const FunctionRef &x) const { if (const auto *symbol{x.proc().GetSymbol()}) { if (!semantics::IsPureProcedure(*symbol)) { return "reference to impure function '"s + symbol->name().ToString() + "'"; } + if (semantics::IsStmtFunction(*symbol)) { + return "reference to statement function '"s + + symbol->name().ToString() + "'"; + } + if (scope_.IsDerivedType() && + specExprContext_ == SpecificationExprContext::BOUND) { // C750 + return "reference to function '"s + symbol->name().ToString() + + "' not allowed for derived type components"; + } // TODO: other checks for standard module procedures } else { const SpecificIntrinsic &intrin{DEREF(x.proc().GetSpecificIntrinsic())}; - if (intrin.name == "present") { + if (scope_.IsDerivedType() && + specExprContext_ == SpecificationExprContext::BOUND) { // C750 + if ((table_.IsIntrinsic(intrin.name) && + badIntrinsicsForComponents_.find(intrin.name) != + badIntrinsicsForComponents_.end()) || + IsProhibitedFunction(intrin.name)) { + return "reference to intrinsic '"s + intrin.name + + "' not allowed for derived type components"; + } + if (table_.GetIntrinsicClass(intrin.name) == + IntrinsicClass::inquiryFunction && + !IsConstantExpr(x)) { + return "non-constant reference to inquiry intrinsic '"s + + intrin.name + "' not allowed for derived type components"; + } + } else if (intrin.name == "present") { return std::nullopt; // no need to check argument(s) } if (IsConstantExpr(x)) { @@ -265,29 +310,42 @@ class CheckSpecificationExprHelper private: 
const semantics::Scope &scope_; + const IntrinsicProcTable &table_; + const SpecificationExprContext specExprContext_; + const std::set badIntrinsicsForComponents_{ + "allocated", "associated", "extends_type_of", "present", "same_type_as"}; + static bool IsProhibitedFunction(std::string name) { return false; } }; template void CheckSpecificationExpr(const A &x, parser::ContextualMessages &messages, - const semantics::Scope &scope) { - if (auto why{CheckSpecificationExprHelper{scope}(x)}) { + const semantics::Scope &scope, const IntrinsicProcTable &table, + SpecificationExprContext specExprContext) { + if (auto why{ + CheckSpecificationExprHelper{scope, table, specExprContext}(x)}) { messages.Say("Invalid specification expression: %s"_err_en_US, *why); } } template void CheckSpecificationExpr(const Expr &, - parser::ContextualMessages &, const semantics::Scope &); + parser::ContextualMessages &, const semantics::Scope &, + const IntrinsicProcTable &, SpecificationExprContext); template void CheckSpecificationExpr(const Expr &, - parser::ContextualMessages &, const semantics::Scope &); + parser::ContextualMessages &, const semantics::Scope &, + const IntrinsicProcTable &, SpecificationExprContext); template void CheckSpecificationExpr(const Expr &, - parser::ContextualMessages &, const semantics::Scope &); + parser::ContextualMessages &, const semantics::Scope &, + const IntrinsicProcTable &, SpecificationExprContext); template void CheckSpecificationExpr(const std::optional> &, - parser::ContextualMessages &, const semantics::Scope &); + parser::ContextualMessages &, const semantics::Scope &, + const IntrinsicProcTable &, SpecificationExprContext); template void CheckSpecificationExpr(const std::optional> &, - parser::ContextualMessages &, const semantics::Scope &); + parser::ContextualMessages &, const semantics::Scope &, + const IntrinsicProcTable &, SpecificationExprContext); template void CheckSpecificationExpr( const std::optional> &, parser::ContextualMessages 
&, - const semantics::Scope &); + const semantics::Scope &, const IntrinsicProcTable &, + SpecificationExprContext); // IsSimplyContiguous() -- 9.5.4 class IsSimplyContiguousHelper diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index cbf082bd8ac5ab..605b100f42f3a7 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -229,6 +229,7 @@ struct IntrinsicInterface { IntrinsicDummyArgument dummy[maxArguments]; TypePattern result; Rank rank{Rank::elemental}; + IntrinsicClass intrinsicClass{IntrinsicClass::elementalFunction}; std::optional Match(const CallCharacteristics &, const common::IntrinsicTypeDefaultKinds &, ActualArguments &, FoldingContext &context) const; @@ -265,19 +266,21 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"aimag", {{"x", SameComplex}}, SameReal}, {"aint", {{"a", SameReal}, MatchingDefaultKIND}, KINDReal}, {"all", {{"mask", SameLogical, Rank::array}, OptionalDIM}, SameLogical, - Rank::dimReduced}, - {"allocated", {{"array", AnyData, Rank::array}}, DefaultLogical}, - {"allocated", {{"scalar", AnyData, Rank::scalar}}, DefaultLogical}, + Rank::dimReduced, IntrinsicClass::transformationalFunction}, + {"allocated", {{"array", AnyData, Rank::array}}, DefaultLogical, + Rank::elemental, IntrinsicClass::inquiryFunction}, + {"allocated", {{"scalar", AnyData, Rank::scalar}}, DefaultLogical, + Rank::elemental, IntrinsicClass::inquiryFunction}, {"anint", {{"a", SameReal}, MatchingDefaultKIND}, KINDReal}, {"any", {{"mask", SameLogical, Rank::array}, OptionalDIM}, SameLogical, - Rank::dimReduced}, + Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"asin", {{"x", SameFloating}}, SameFloating}, {"asind", {{"x", SameFloating}}, SameFloating}, {"asinh", {{"x", SameFloating}}, SameFloating}, {"associated", {{"pointer", Addressable, Rank::known}, {"target", Addressable, Rank::known, Optionality::optional}}, - DefaultLogical}, + DefaultLogical, Rank::elemental, 
IntrinsicClass::inquiryFunction}, {"atan", {{"x", SameFloating}}, SameFloating}, {"atand", {{"x", SameFloating}}, SameFloating}, {"atan", {{"y", OperandReal}, {"x", OperandReal}}, OperandReal}, @@ -291,14 +294,14 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"bessel_jn", {{"n1", AnyInt, Rank::scalar}, {"n2", AnyInt, Rank::scalar}, {"x", SameReal, Rank::scalar}}, - SameReal, Rank::vector}, + SameReal, Rank::vector, IntrinsicClass::transformationalFunction}, {"bessel_y0", {{"x", SameReal}}, SameReal}, {"bessel_y1", {{"x", SameReal}}, SameReal}, {"bessel_yn", {{"n", AnyInt}, {"x", SameReal}}, SameReal}, {"bessel_yn", {{"n1", AnyInt, Rank::scalar}, {"n2", AnyInt, Rank::scalar}, {"x", SameReal, Rank::scalar}}, - SameReal, Rank::vector}, + SameReal, Rank::vector, IntrinsicClass::transformationalFunction}, {"bge", {{"i", AnyInt, Rank::elementalOrBOZ}, {"j", AnyInt, Rank::elementalOrBOZ}}, @@ -308,7 +311,7 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"j", AnyInt, Rank::elementalOrBOZ}}, DefaultLogical}, {"bit_size", {{"i", SameInt, Rank::anyOrAssumedRank}}, SameInt, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"ble", {{"i", AnyInt, Rank::elementalOrBOZ}, {"j", AnyInt, Rank::elementalOrBOZ}}, @@ -327,34 +330,36 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"y", AnyIntOrReal, Rank::elementalOrBOZ, Optionality::optional}, DefaultingKIND}, KINDComplex}, - {"command_argument_count", {}, DefaultInt, Rank::scalar}, + {"command_argument_count", {}, DefaultInt, Rank::scalar, + IntrinsicClass::transformationalFunction}, {"conjg", {{"z", SameComplex}}, SameComplex}, {"cos", {{"x", SameFloating}}, SameFloating}, {"cosd", {{"x", SameFloating}}, SameFloating}, {"cosh", {{"x", SameFloating}}, SameFloating}, {"count", {{"mask", AnyLogical, Rank::array}, OptionalDIM, DefaultingKIND}, - KINDInt, Rank::dimReduced}, + KINDInt, Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"cshift", {{"array", 
SameType, Rank::array}, {"shift", AnyInt, Rank::dimRemoved}, OptionalDIM}, - SameType, Rank::conformable}, + SameType, Rank::conformable, IntrinsicClass::transformationalFunction}, {"dble", {{"a", AnyNumeric, Rank::elementalOrBOZ}}, DoublePrecision}, {"digits", {{"x", AnyIntOrReal, Rank::anyOrAssumedRank}}, DefaultInt, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"dim", {{"x", OperandIntOrReal}, {"y", OperandIntOrReal}}, OperandIntOrReal}, {"dot_product", {{"vector_a", AnyLogical, Rank::vector}, {"vector_b", AnyLogical, Rank::vector}}, - ResultLogical, Rank::scalar}, + ResultLogical, Rank::scalar, IntrinsicClass::transformationalFunction}, {"dot_product", {{"vector_a", AnyComplex, Rank::vector}, {"vector_b", AnyNumeric, Rank::vector}}, - ResultNumeric, Rank::scalar}, // conjugates vector_a + ResultNumeric, Rank::scalar, // conjugates vector_a + IntrinsicClass::transformationalFunction}, {"dot_product", {{"vector_a", AnyIntOrReal, Rank::vector}, {"vector_b", AnyNumeric, Rank::vector}}, - ResultNumeric, Rank::scalar}, + ResultNumeric, Rank::scalar, IntrinsicClass::transformationalFunction}, {"dprod", {{"x", DefaultReal}, {"y", DefaultReal}}, DoublePrecision}, {"dshiftl", {{"i", SameInt}, {"j", SameInt, Rank::elementalOrBOZ}, @@ -372,68 +377,72 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"boundary", SameIntrinsic, Rank::dimRemoved, Optionality::optional}, OptionalDIM}, - SameIntrinsic, Rank::conformable}, + SameIntrinsic, Rank::conformable, + IntrinsicClass::transformationalFunction}, {"eoshift", {{"array", SameDerivedType, Rank::array}, {"shift", AnyInt, Rank::dimRemoved}, {"boundary", SameDerivedType, Rank::dimRemoved}, OptionalDIM}, - SameDerivedType, Rank::conformable}, + SameDerivedType, Rank::conformable, + IntrinsicClass::transformationalFunction}, {"epsilon", {{"x", SameReal, Rank::anyOrAssumedRank}}, SameReal, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"erf", {{"x", SameReal}}, SameReal}, 
{"erfc", {{"x", SameReal}}, SameReal}, {"erfc_scaled", {{"x", SameReal}}, SameReal}, {"exp", {{"x", SameFloating}}, SameFloating}, + {"exp", {{"x", SameFloating}}, SameFloating}, {"exponent", {{"x", AnyReal}}, DefaultInt}, + {"exp", {{"x", SameFloating}}, SameFloating}, {"extends_type_of", {{"a", ExtensibleDerived, Rank::anyOrAssumedRank}, {"mold", ExtensibleDerived, Rank::anyOrAssumedRank}}, - DefaultLogical, Rank::scalar}, + DefaultLogical, Rank::scalar, IntrinsicClass::inquiryFunction}, {"findloc", {{"array", AnyNumeric, Rank::array}, {"value", AnyNumeric, Rank::scalar}, RequiredDIM, OptionalMASK, SizeDefaultKIND, {"back", AnyLogical, Rank::scalar, Optionality::optional}}, - KINDInt, Rank::dimRemoved}, + KINDInt, Rank::dimRemoved, IntrinsicClass::transformationalFunction}, {"findloc", {{"array", AnyNumeric, Rank::array}, {"value", AnyNumeric, Rank::scalar}, OptionalMASK, SizeDefaultKIND, {"back", AnyLogical, Rank::scalar, Optionality::optional}}, - KINDInt, Rank::vector}, + KINDInt, Rank::vector, IntrinsicClass::transformationalFunction}, {"findloc", {{"array", SameChar, Rank::array}, {"value", SameChar, Rank::scalar}, RequiredDIM, OptionalMASK, SizeDefaultKIND, {"back", AnyLogical, Rank::scalar, Optionality::optional}}, - KINDInt, Rank::dimRemoved}, + KINDInt, Rank::dimRemoved, IntrinsicClass::transformationalFunction}, {"findloc", {{"array", SameChar, Rank::array}, {"value", SameChar, Rank::scalar}, OptionalMASK, SizeDefaultKIND, {"back", AnyLogical, Rank::scalar, Optionality::optional}}, - KINDInt, Rank::vector}, + KINDInt, Rank::vector, IntrinsicClass::transformationalFunction}, {"findloc", {{"array", AnyLogical, Rank::array}, {"value", AnyLogical, Rank::scalar}, RequiredDIM, OptionalMASK, SizeDefaultKIND, {"back", AnyLogical, Rank::scalar, Optionality::optional}}, - KINDInt, Rank::dimRemoved}, + KINDInt, Rank::dimRemoved, IntrinsicClass::transformationalFunction}, {"findloc", {{"array", AnyLogical, Rank::array}, {"value", AnyLogical, Rank::scalar}, 
OptionalMASK, SizeDefaultKIND, {"back", AnyLogical, Rank::scalar, Optionality::optional}}, - KINDInt, Rank::vector}, + KINDInt, Rank::vector, IntrinsicClass::transformationalFunction}, {"floor", {{"a", AnyReal}, DefaultingKIND}, KINDInt}, {"fraction", {{"x", SameReal}}, SameReal}, {"gamma", {{"x", SameReal}}, SameReal}, {"huge", {{"x", SameIntOrReal, Rank::anyOrAssumedRank}}, SameIntOrReal, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"hypot", {{"x", OperandReal}, {"y", OperandReal}}, OperandReal}, {"iachar", {{"c", AnyChar}, DefaultingKIND}, KINDInt}, {"iall", {{"array", SameInt, Rank::array}, OptionalDIM, OptionalMASK}, - SameInt, Rank::dimReduced}, + SameInt, Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"iany", {{"array", SameInt, Rank::array}, OptionalDIM, OptionalMASK}, - SameInt, Rank::dimReduced}, + SameInt, Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"iparity", {{"array", SameInt, Rank::array}, OptionalDIM, OptionalMASK}, - SameInt, Rank::dimReduced}, + SameInt, Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"iand", {{"i", SameInt}, {"j", SameInt, Rank::elementalOrBOZ}}, SameInt}, {"iand", {{"i", BOZ}, {"j", SameInt}}, SameInt}, {"ibclr", {{"i", SameInt}, {"pos", AnyInt}}, SameInt}, @@ -461,19 +470,20 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"size", AnyInt, Rank::elemental, Optionality::optional}}, SameInt}, {"is_contiguous", {{"array", Addressable, Rank::anyOrAssumedRank}}, - DefaultLogical}, + DefaultLogical, Rank::elemental, IntrinsicClass::inquiryFunction}, {"is_iostat_end", {{"i", AnyInt}}, DefaultLogical}, {"is_iostat_eor", {{"i", AnyInt}}, DefaultLogical}, - {"kind", {{"x", AnyIntrinsic}}, DefaultInt}, + {"kind", {{"x", AnyIntrinsic}}, DefaultInt, Rank::elemental, + IntrinsicClass::inquiryFunction}, {"lbound", {{"array", AnyData, Rank::anyOrAssumedRank}, RequiredDIM, SizeDefaultKIND}, - KINDInt, Rank::scalar}, + KINDInt, Rank::scalar, 
IntrinsicClass::inquiryFunction}, {"lbound", {{"array", AnyData, Rank::anyOrAssumedRank}, SizeDefaultKIND}, - KINDInt, Rank::vector}, + KINDInt, Rank::vector, IntrinsicClass::inquiryFunction}, {"leadz", {{"i", AnyInt}}, DefaultInt}, {"len", {{"string", AnyChar, Rank::anyOrAssumedRank}, DefaultingKIND}, - KINDInt, Rank::scalar}, + KINDInt, Rank::scalar, IntrinsicClass::inquiryFunction}, {"len_trim", {{"string", AnyChar}, DefaultingKIND}, KINDInt}, {"lge", {{"string_a", SameChar}, {"string_b", SameChar}}, DefaultLogical}, {"lgt", {{"string_a", SameChar}, {"string_b", SameChar}}, DefaultLogical}, @@ -488,27 +498,27 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"matmul", {{"array_a", AnyLogical, Rank::vector}, {"array_b", AnyLogical, Rank::matrix}}, - ResultLogical, Rank::vector}, + ResultLogical, Rank::vector, IntrinsicClass::transformationalFunction}, {"matmul", {{"array_a", AnyLogical, Rank::matrix}, {"array_b", AnyLogical, Rank::vector}}, - ResultLogical, Rank::vector}, + ResultLogical, Rank::vector, IntrinsicClass::transformationalFunction}, {"matmul", {{"array_a", AnyLogical, Rank::matrix}, {"array_b", AnyLogical, Rank::matrix}}, - ResultLogical, Rank::matrix}, + ResultLogical, Rank::matrix, IntrinsicClass::transformationalFunction}, {"matmul", {{"array_a", AnyNumeric, Rank::vector}, {"array_b", AnyNumeric, Rank::matrix}}, - ResultNumeric, Rank::vector}, + ResultNumeric, Rank::vector, IntrinsicClass::transformationalFunction}, {"matmul", {{"array_a", AnyNumeric, Rank::matrix}, {"array_b", AnyNumeric, Rank::vector}}, - ResultNumeric, Rank::vector}, + ResultNumeric, Rank::vector, IntrinsicClass::transformationalFunction}, {"matmul", {{"array_a", AnyNumeric, Rank::matrix}, {"array_b", AnyNumeric, Rank::matrix}}, - ResultNumeric, Rank::matrix}, + ResultNumeric, Rank::matrix, IntrinsicClass::transformationalFunction}, {"maskl", {{"i", AnyInt}, DefaultingKIND}, KINDInt}, {"maskr", {{"i", AnyInt}, DefaultingKIND}, KINDInt}, {"max", @@ -520,15 +530,16 
@@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"a3", SameChar, Rank::elemental, Optionality::repeats}}, SameChar}, {"maxexponent", {{"x", AnyReal, Rank::anyOrAssumedRank}}, DefaultInt, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"maxloc", {{"array", AnyRelatable, Rank::array}, OptionalDIM, OptionalMASK, SizeDefaultKIND, {"back", AnyLogical, Rank::scalar, Optionality::optional}}, - KINDInt, Rank::dimReduced}, + KINDInt, Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"maxval", {{"array", SameRelatable, Rank::array}, OptionalDIM, OptionalMASK}, - SameRelatable, Rank::dimReduced}, + SameRelatable, Rank::dimReduced, + IntrinsicClass::transformationalFunction}, {"merge", {{"tsource", SameType}, {"fsource", SameType}, {"mask", AnyLogical}}, SameType}, @@ -548,25 +559,26 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"a3", SameChar, Rank::elemental, Optionality::repeats}}, SameChar}, {"minexponent", {{"x", AnyReal, Rank::anyOrAssumedRank}}, DefaultInt, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"minloc", {{"array", AnyRelatable, Rank::array}, OptionalDIM, OptionalMASK, SizeDefaultKIND, {"back", AnyLogical, Rank::scalar, Optionality::optional}}, - KINDInt, Rank::dimReduced}, + KINDInt, Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"minval", {{"array", SameRelatable, Rank::array}, OptionalDIM, OptionalMASK}, - SameRelatable, Rank::dimReduced}, + SameRelatable, Rank::dimReduced, + IntrinsicClass::transformationalFunction}, {"mod", {{"a", OperandIntOrReal}, {"p", OperandIntOrReal}}, OperandIntOrReal}, {"modulo", {{"a", OperandIntOrReal}, {"p", OperandIntOrReal}}, OperandIntOrReal}, {"nearest", {{"x", SameReal}, {"s", AnyReal}}, SameReal}, {"new_line", {{"x", SameChar, Rank::anyOrAssumedRank}}, SameChar, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"nint", {{"a", AnyReal}, DefaultingKIND}, KINDInt}, {"norm2", {{"x", SameReal, Rank::array}, 
OptionalDIM}, SameReal, - Rank::dimReduced}, + Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"not", {{"i", SameInt}}, SameInt}, // NULL() is a special case handled in Probe() below {"out_of_range", @@ -581,24 +593,25 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {{"array", SameType, Rank::array}, {"mask", AnyLogical, Rank::conformable}, {"vector", SameType, Rank::vector, Optionality::optional}}, - SameType, Rank::vector}, + SameType, Rank::vector, IntrinsicClass::transformationalFunction}, {"parity", {{"mask", SameLogical, Rank::array}, OptionalDIM}, SameLogical, - Rank::dimReduced}, + Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"popcnt", {{"i", AnyInt}}, DefaultInt}, {"poppar", {{"i", AnyInt}}, DefaultInt}, {"product", {{"array", SameNumeric, Rank::array}, OptionalDIM, OptionalMASK}, - SameNumeric, Rank::dimReduced}, + SameNumeric, Rank::dimReduced, + IntrinsicClass::transformationalFunction}, {"precision", {{"x", AnyFloating, Rank::anyOrAssumedRank}}, DefaultInt, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"present", {{"a", Addressable, Rank::anyOrAssumedRank}}, DefaultLogical, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"radix", {{"x", AnyIntOrReal, Rank::anyOrAssumedRank}}, DefaultInt, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, {"range", {{"x", AnyNumeric, Rank::anyOrAssumedRank}}, DefaultInt, - Rank::scalar}, - {"rank", {{"a", AnyData, Rank::anyOrAssumedRank}}, DefaultInt, - Rank::scalar}, + Rank::scalar, IntrinsicClass::inquiryFunction}, + {"rank", {{"a", AnyData, Rank::anyOrAssumedRank}}, DefaultInt, Rank::scalar, + IntrinsicClass::inquiryFunction}, {"real", {{"a", SameComplex, Rank::elemental}}, SameReal}, // 16.9.160(4)(ii) {"real", {{"a", AnyNumeric, Rank::elementalOrBOZ}, DefaultingKIND}, @@ -608,19 +621,19 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"operation", SameType, Rank::reduceOperation}, OptionalDIM, 
OptionalMASK, {"identity", SameType, Rank::scalar}, {"ordered", AnyLogical, Rank::scalar, Optionality::optional}}, - SameType, Rank::dimReduced}, + SameType, Rank::dimReduced, IntrinsicClass::transformationalFunction}, {"repeat", {{"string", SameChar, Rank::scalar}, {"ncopies", AnyInt}}, - SameChar, Rank::scalar}, + SameChar, Rank::scalar, IntrinsicClass::transformationalFunction}, {"reshape", {{"source", SameType, Rank::array}, {"shape", AnyInt, Rank::shape}, {"pad", SameType, Rank::array, Optionality::optional}, {"order", AnyInt, Rank::vector, Optionality::optional}}, - SameType, Rank::shaped}, + SameType, Rank::shaped, IntrinsicClass::transformationalFunction}, {"rrspacing", {{"x", SameReal}}, SameReal}, {"same_type_as", {{"a", ExtensibleDerived, Rank::anyOrAssumedRank}, {"b", ExtensibleDerived, Rank::anyOrAssumedRank}}, - DefaultLogical, Rank::scalar}, + DefaultLogical, Rank::scalar, IntrinsicClass::inquiryFunction}, {"scale", {{"x", SameReal}, {"i", AnyInt}}, SameReal}, {"scan", {{"string", SameChar}, {"set", SameChar}, @@ -628,27 +641,27 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ DefaultingKIND}, KINDInt}, {"selected_char_kind", {{"name", DefaultChar, Rank::scalar}}, DefaultInt, - Rank::scalar}, + Rank::scalar, IntrinsicClass::transformationalFunction}, {"selected_int_kind", {{"r", AnyInt, Rank::scalar}}, DefaultInt, - Rank::scalar}, + Rank::scalar, IntrinsicClass::transformationalFunction}, {"selected_real_kind", {{"p", AnyInt, Rank::scalar}, {"r", AnyInt, Rank::scalar, Optionality::optional}, {"radix", AnyInt, Rank::scalar, Optionality::optional}}, - DefaultInt, Rank::scalar}, + DefaultInt, Rank::scalar, IntrinsicClass::transformationalFunction}, {"selected_real_kind", {{"p", AnyInt, Rank::scalar, Optionality::optional}, {"r", AnyInt, Rank::scalar}, {"radix", AnyInt, Rank::scalar, Optionality::optional}}, - DefaultInt, Rank::scalar}, + DefaultInt, Rank::scalar, IntrinsicClass::transformationalFunction}, {"selected_real_kind", {{"p", 
AnyInt, Rank::scalar, Optionality::optional}, {"r", AnyInt, Rank::scalar, Optionality::optional}, {"radix", AnyInt, Rank::scalar}}, - DefaultInt, Rank::scalar}, + DefaultInt, Rank::scalar, IntrinsicClass::transformationalFunction}, {"set_exponent", {{"x", SameReal}, {"i", AnyInt}}, SameReal}, {"shape", {{"source", AnyData, Rank::anyOrAssumedRank}, SizeDefaultKIND}, - KINDInt, Rank::vector}, + KINDInt, Rank::vector, IntrinsicClass::inquiryFunction}, {"shifta", {{"i", SameInt}, {"shift", AnyInt}}, SameInt}, {"shiftl", {{"i", SameInt}, {"shift", AnyInt}}, SameInt}, {"shiftr", {{"i", SameInt}, {"shift", AnyInt}}, SameInt}, @@ -659,45 +672,49 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"size", {{"array", AnyData, Rank::anyOrAssumedRank}, OptionalDIM, SizeDefaultKIND}, - KINDInt, Rank::scalar}, + KINDInt, Rank::scalar, IntrinsicClass::inquiryFunction}, {"spacing", {{"x", SameReal}}, SameReal}, {"spread", {{"source", SameType, Rank::known}, RequiredDIM, {"ncopies", AnyInt, Rank::scalar}}, - SameType, Rank::rankPlus1}, + SameType, Rank::rankPlus1, IntrinsicClass::transformationalFunction}, {"sqrt", {{"x", SameFloating}}, SameFloating}, {"storage_size", {{"a", AnyData, Rank::anyOrAssumedRank}, SizeDefaultKIND}, - KINDInt, Rank::scalar}, + KINDInt, Rank::scalar, IntrinsicClass::inquiryFunction}, {"sum", {{"array", SameNumeric, Rank::array}, OptionalDIM, OptionalMASK}, - SameNumeric, Rank::dimReduced}, + SameNumeric, Rank::dimReduced, + IntrinsicClass::transformationalFunction}, {"tan", {{"x", SameFloating}}, SameFloating}, {"tand", {{"x", SameFloating}}, SameFloating}, {"tanh", {{"x", SameFloating}}, SameFloating}, - {"tiny", {{"x", SameReal, Rank::anyOrAssumedRank}}, SameReal, Rank::scalar}, + {"tiny", {{"x", SameReal, Rank::anyOrAssumedRank}}, SameReal, Rank::scalar, + IntrinsicClass::inquiryFunction}, {"trailz", {{"i", AnyInt}}, DefaultInt}, {"transfer", {{"source", AnyData, Rank::known}, {"mold", SameType, Rank::scalar}}, - SameType, Rank::scalar}, + 
SameType, Rank::scalar, IntrinsicClass::transformationalFunction}, {"transfer", {{"source", AnyData, Rank::known}, {"mold", SameType, Rank::array}}, - SameType, Rank::vector}, + SameType, Rank::vector, IntrinsicClass::transformationalFunction}, {"transfer", {{"source", AnyData, Rank::anyOrAssumedRank}, {"mold", SameType, Rank::anyOrAssumedRank}, {"size", AnyInt, Rank::scalar}}, - SameType, Rank::vector}, - {"transpose", {{"matrix", SameType, Rank::matrix}}, SameType, Rank::matrix}, - {"trim", {{"string", SameChar, Rank::scalar}}, SameChar, Rank::scalar}, + SameType, Rank::vector, IntrinsicClass::transformationalFunction}, + {"transpose", {{"matrix", SameType, Rank::matrix}}, SameType, Rank::matrix, + IntrinsicClass::transformationalFunction}, + {"trim", {{"string", SameChar, Rank::scalar}}, SameChar, Rank::scalar, + IntrinsicClass::transformationalFunction}, {"ubound", {{"array", AnyData, Rank::anyOrAssumedRank}, RequiredDIM, SizeDefaultKIND}, - KINDInt, Rank::scalar}, + KINDInt, Rank::scalar, IntrinsicClass::inquiryFunction}, {"ubound", {{"array", AnyData, Rank::anyOrAssumedRank}, SizeDefaultKIND}, - KINDInt, Rank::vector}, + KINDInt, Rank::vector, IntrinsicClass::inquiryFunction}, {"unpack", {{"vector", SameType, Rank::vector}, {"mask", AnyLogical, Rank::array}, {"field", SameType, Rank::conformable}}, - SameType, Rank::conformable}, + SameType, Rank::conformable, IntrinsicClass::transformationalFunction}, {"verify", {{"string", SameChar}, {"set", SameChar}, {"back", AnyLogical, Rank::elemental, Optionality::optional}, @@ -900,33 +917,34 @@ static const SpecificIntrinsicInterface specificIntrinsicFunction[]{ }; static const IntrinsicInterface intrinsicSubroutine[]{ - {"cpu_time", {{"time", AnyReal, Rank::scalar}}, {}}, + {"cpu_time", {{"time", AnyReal, Rank::scalar}}, {}, Rank::elemental, + IntrinsicClass::impureSubroutine}, {"date_and_time", {{"date", DefaultChar, Rank::scalar, Optionality::optional}, {"time", DefaultChar, Rank::scalar, Optionality::optional}, 
{"zone", DefaultChar, Rank::scalar, Optionality::optional}, {"values", AnyInt, Rank::vector, Optionality::optional}}, - {}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, {"execute_command_line", {{"command", DefaultChar, Rank::scalar}, {"wait", AnyLogical, Rank::scalar, Optionality::optional}, {"exitstat", AnyInt, Rank::scalar, Optionality::optional}, {"cmdstat", AnyInt, Rank::scalar, Optionality::optional}, {"cmdmsg", DefaultChar, Rank::scalar, Optionality::optional}}, - {}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, {"get_command", {{"command", DefaultChar, Rank::scalar, Optionality::optional}, {"length", AnyInt, Rank::scalar, Optionality::optional}, {"status", AnyInt, Rank::scalar, Optionality::optional}, {"errmsg", DefaultChar, Rank::scalar, Optionality::optional}}, - {}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, {"get_command_argument", {{"number", AnyInt, Rank::scalar}, {"value", DefaultChar, Rank::scalar, Optionality::optional}, {"length", AnyInt, Rank::scalar, Optionality::optional}, {"status", AnyInt, Rank::scalar, Optionality::optional}, {"errmsg", DefaultChar, Rank::scalar, Optionality::optional}}, - {}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, {"get_environment_variable", {{"name", DefaultChar, Rank::scalar}, {"value", DefaultChar, Rank::scalar, Optionality::optional}, @@ -934,31 +952,34 @@ static const IntrinsicInterface intrinsicSubroutine[]{ {"status", AnyInt, Rank::scalar, Optionality::optional}, {"trim_name", AnyLogical, Rank::scalar, Optionality::optional}, {"errmsg", DefaultChar, Rank::scalar, Optionality::optional}}, - {}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, {"move_alloc", {{"from", SameType, Rank::known}, {"to", SameType, Rank::known}, {"stat", AnyInt, Rank::scalar, Optionality::optional}, {"errmsg", DefaultChar, Rank::scalar, Optionality::optional}}, - {}}, + {}, Rank::elemental, IntrinsicClass::pureSubroutine}, {"mvbits", {{"from", SameInt}, {"frompos", 
AnyInt}, {"len", AnyInt}, {"to", SameInt}, {"topos", AnyInt}}, - {}}, // elemental + {}, Rank::elemental, IntrinsicClass::elementalSubroutine}, // elemental {"random_init", {{"repeatable", AnyLogical, Rank::scalar}, {"image_distinct", AnyLogical, Rank::scalar}}, - {}}, - {"random_number", {{"harvest", AnyReal, Rank::known}}, {}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, + {"random_number", {{"harvest", AnyReal, Rank::known}}, {}, Rank::elemental, + IntrinsicClass::impureSubroutine}, {"random_seed", {{"size", DefaultInt, Rank::scalar, Optionality::optional}, {"put", DefaultInt, Rank::vector, Optionality::optional}, {"get", DefaultInt, Rank::vector, Optionality::optional}}, - {}}, // TODO: at most one argument can be present + {}, Rank::elemental, + IntrinsicClass::impureSubroutine}, // TODO: at most one argument can be + // present {"system_clock", {{"count", AnyInt, Rank::scalar, Optionality::optional}, {"count_rate", AnyIntOrReal, Rank::scalar, Optionality::optional}, {"count_max", AnyInt, Rank::scalar, Optionality::optional}}, - {}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, }; // TODO: Intrinsic subroutine EVENT_QUERY @@ -1532,6 +1553,8 @@ class IntrinsicProcTable::Implementation { bool IsIntrinsic(const std::string &) const; + IntrinsicClass GetIntrinsicClass(const std::string &) const; + std::optional Probe(const CallCharacteristics &, ActualArguments &, FoldingContext &, const IntrinsicProcTable &) const; @@ -1571,6 +1594,23 @@ bool IntrinsicProcTable::Implementation::IsIntrinsic( return name == "null" || name == "__builtin_c_f_pointer"; } +IntrinsicClass IntrinsicProcTable::Implementation::GetIntrinsicClass( + const std::string &name) const { + auto specificIntrinsic{specificFuncs_.find(name)}; + if (specificIntrinsic != specificFuncs_.end()) { + return specificIntrinsic->second->intrinsicClass; + } + auto genericIntrinsic{genericFuncs_.find(name)}; + if (genericIntrinsic != genericFuncs_.end()) { + return 
genericIntrinsic->second->intrinsicClass; + } + auto subrIntrinsic{subroutines_.find(name)}; + if (subrIntrinsic != subroutines_.end()) { + return subrIntrinsic->second->intrinsicClass; + } + return IntrinsicClass::noClass; +} + bool CheckAndRearrangeArguments(ActualArguments &arguments, parser::ContextualMessages &messages, const char *const dummyKeywords[], std::size_t trailingOptionals) { @@ -2014,6 +2054,11 @@ bool IntrinsicProcTable::IsIntrinsic(const std::string &name) const { return DEREF(impl_).IsIntrinsic(name); } +IntrinsicClass IntrinsicProcTable::GetIntrinsicClass( + const std::string &name) const { + return DEREF(impl_).GetIntrinsicClass(name); +} + std::optional IntrinsicProcTable::Probe( const CallCharacteristics &call, ActualArguments &arguments, FoldingContext &context) const { diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index da02b4fbe47f3e..edbd01d4eca07c 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -33,7 +33,10 @@ class CheckHelper { void Check() { Check(context_.globalScope()); } void Check(const ParamValue &, bool canBeAssumed); - void Check(const Bound &bound) { CheckSpecExpr(bound.GetExplicit()); } + void Check(const Bound &bound) { + CheckSpecExpr( + bound.GetExplicit(), evaluate::SpecificationExprContext::BOUND); + } void Check(const ShapeSpec &spec) { Check(spec.lbound()); Check(spec.ubound()); @@ -44,7 +47,9 @@ class CheckHelper { void Check(const Scope &); private: - template void CheckSpecExpr(const A &x) { + template + void CheckSpecExpr( + const A &x, const evaluate::SpecificationExprContext specExprContext) { if (symbolBeingChecked_ && IsSaved(*symbolBeingChecked_)) { if (!evaluate::IsConstantExpr(x)) { messages_.Say( @@ -52,18 +57,23 @@ class CheckHelper { symbolBeingChecked_->name()); } } else { - evaluate::CheckSpecificationExpr(x, messages_, DEREF(scope_)); + evaluate::CheckSpecificationExpr( + x, messages_, 
DEREF(scope_), context_.intrinsics(), specExprContext); } } - template void CheckSpecExpr(const std::optional &x) { + template + void CheckSpecExpr(const std::optional &x, + const evaluate::SpecificationExprContext specExprContext) { if (x) { - CheckSpecExpr(*x); + CheckSpecExpr(*x, specExprContext); } } - template void CheckSpecExpr(A &x) { + template + void CheckSpecExpr( + A &x, const evaluate::SpecificationExprContext specExprContext) { x = Fold(foldingContext_, std::move(x)); const A &constx{x}; - CheckSpecExpr(constx); + CheckSpecExpr(constx, specExprContext); } void CheckValue(const Symbol &, const DerivedTypeSpec *); void CheckVolatile( @@ -131,7 +141,8 @@ void CheckHelper::Check(const ParamValue &value, bool canBeAssumed) { " external function result"_err_en_US); } } else { - CheckSpecExpr(value.GetExplicit()); + CheckSpecExpr( + value.GetExplicit(), evaluate::SpecificationExprContext::TYPE_PARAM); } } @@ -384,15 +395,25 @@ void CheckHelper::CheckObjectEntity( CheckAssumedTypeEntity(symbol, details); symbolBeingChecked_ = nullptr; if (!details.coshape().empty()) { + bool isDeferredShape{details.coshape().IsDeferredShape()}; if (IsAllocatable(symbol)) { - if (!details.coshape().IsDeferredShape()) { // C827 - messages_.Say( - "ALLOCATABLE coarray must have a deferred coshape"_err_en_US); + if (!isDeferredShape) { // C827 + messages_.Say("'%s' is an ALLOCATABLE coarray and must have a deferred" + " coshape"_err_en_US, + symbol.name()); } + } else if (symbol.owner().IsDerivedType()) { // C746 + std::string deferredMsg{ + isDeferredShape ? 
"" : " and have a deferred coshape"}; + messages_.Say("Component '%s' is a coarray and must have the ALLOCATABLE" + " attribute%s"_err_en_US, + symbol.name(), deferredMsg); } else { if (!details.coshape().IsAssumedSize()) { // C828 messages_.Say( - "Non-ALLOCATABLE coarray must have an explicit coshape"_err_en_US); + "Component '%s' is a non-ALLOCATABLE coarray and must have" + " an explicit coshape"_err_en_US, + symbol.name()); } } } @@ -409,7 +430,8 @@ void CheckHelper::CheckObjectEntity( "An INTENT(OUT) dummy argument may not be, or contain, EVENT_TYPE or LOCK_TYPE"_err_en_US); } } - if (InPure() && !IsPointer(symbol) && !IsIntentIn(symbol) && + if (InPure() && !IsStmtFunction(DEREF(innermostSymbol_)) && + !IsPointer(symbol) && !IsIntentIn(symbol) && !symbol.attrs().test(Attr::VALUE)) { if (InFunction()) { // C1583 messages_.Say( diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 3431bc05392ef3..9306f702aabbd1 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -2092,13 +2092,14 @@ std::optional ExpressionAnalyzer::CheckCall( } semantics::CheckArguments(*chars, arguments, GetFoldingContext(), context_.FindScope(callSite), treatExternalAsImplicit); - if (!chars->attrs.test(characteristics::Procedure::Attr::Pure)) { + const Symbol *procSymbol{proc.GetSymbol()}; + if (procSymbol && !IsPureProcedure(*procSymbol)) { if (const semantics::Scope * pure{semantics::FindPureProcedureContaining( context_.FindScope(callSite))}) { Say(callSite, "Procedure '%s' referenced in pure subprogram '%s' must be pure too"_err_en_US, - DEREF(proc.GetSymbol()).name(), DEREF(pure->symbol()).name()); + procSymbol->name(), DEREF(pure->symbol()).name()); } } } diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 6d04c7f229ed15..e51c33988d0d7e 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -3679,7 +3679,7 @@ bool 
DeclarationVisitor::Pre(const parser::DerivedTypeDef &x) { if (symbol->has() && !paramNames.count(name)) { SayDerivedType(name, "'%s' is not a type parameter of this derived type"_err_en_US, - currScope()); // C742 + currScope()); // C741 } } Walk(std::get>>(x.t)); @@ -3820,14 +3820,50 @@ void DeclarationVisitor::Post(const parser::ComponentDecl &x) { !attrs.HasAny({Attr::PUBLIC, Attr::PRIVATE})) { attrs.set(Attr::PRIVATE); } - if (!attrs.HasAny({Attr::POINTER, Attr::ALLOCATABLE})) { - if (const auto *declType{GetDeclTypeSpec()}) { - if (const auto *derived{declType->AsDerived()}) { + if (const auto *declType{GetDeclTypeSpec()}) { + if (const auto *derived{declType->AsDerived()}) { + if (!attrs.HasAny({Attr::POINTER, Attr::ALLOCATABLE})) { if (derivedTypeInfo_.type == &derived->typeSymbol()) { // C744 Say("Recursive use of the derived type requires " "POINTER or ALLOCATABLE"_err_en_US); } } + if (!coarraySpec().empty()) { // C747 + if (IsTeamType(derived)) { + Say("A coarray component may not be of type TEAM_TYPE from " + "ISO_FORTRAN_ENV"_err_en_US); + } else { + if (IsIsoCType(derived)) { + Say("A coarray component may not be of type C_PTR or C_FUNPTR from " + "ISO_C_BINDING"_err_en_US); + } + } + } + if (auto it{FindCoarrayUltimateComponent(*derived)}) { // C748 + std::string ultimateName{it.BuildResultDesignatorName()}; + // Strip off the leading "%" + if (ultimateName.length() > 1) { + ultimateName.erase(0, 1); + if (attrs.HasAny({Attr::POINTER, Attr::ALLOCATABLE})) { + evaluate::AttachDeclaration( + Say(name.source, + "A component with a POINTER or ALLOCATABLE attribute may " + "not " + "be of a type with a coarray ultimate component (named " + "'%s')"_err_en_US, + ultimateName), + derived->typeSymbol()); + } + if (!arraySpec().empty() || !coarraySpec().empty()) { + evaluate::AttachDeclaration( + Say(name.source, + "An array or coarray component may not be of a type with a " + "coarray ultimate component (named '%s')"_err_en_US, + ultimateName), + 
derived->typeSymbol()); + } + } + } } } if (OkToAddComponent(name)) { @@ -4741,7 +4777,7 @@ Symbol *DeclarationVisitor::MakeTypeSymbol( const SourceName &name, Details &&details) { Scope &derivedType{currScope()}; CHECK(derivedType.IsDerivedType()); - if (auto *symbol{FindInScope(derivedType, name)}) { + if (auto *symbol{FindInScope(derivedType, name)}) { // C742 Say2(name, "Type parameter, component, or procedure binding '%s'" " already defined in this type"_err_en_US, diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index 249dcb27b65afc..3b68beaa557fc7 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -270,6 +270,24 @@ bool IsPureProcedure(const Symbol &symbol) { } else if (!IsProcedure(symbol)) { return false; } + if (IsStmtFunction(symbol)) { + // Section 15.7(1) states that a statement function is PURE if it does not + // reference an IMPURE procedure or a VOLATILE variable + const MaybeExpr &expr{symbol.get().stmtFunction()}; + if (expr) { + for (const Symbol &refSymbol : evaluate::CollectSymbols(*expr)) { + if (IsFunction(refSymbol) && !IsPureProcedure(refSymbol)) { + return false; + } + if (const Symbol * root{GetAssociationRoot(refSymbol)}) { + if (root->attrs().test(Attr::VOLATILE)) { + return false; + } + } + } + } + return true; // statement function was not found to be impure + } return symbol.attrs().test(Attr::PURE) || (symbol.attrs().test(Attr::ELEMENTAL) && !symbol.attrs().test(Attr::IMPURE)); @@ -1356,4 +1374,5 @@ void LabelEnforce::SayWithConstruct(SemanticsContext &context, context.Say(stmtLocation, message) .Attach(constructLocation, GetEnclosingConstructMsg()); } + } // namespace Fortran::semantics diff --git a/flang/test/Semantics/allocate11.f90 b/flang/test/Semantics/allocate11.f90 index 594bd1ded385f2..01b9944019ae39 100644 --- a/flang/test/Semantics/allocate11.f90 +++ b/flang/test/Semantics/allocate11.f90 @@ -38,6 +38,7 @@ subroutine C937(var) type B type(A) y + !ERROR: A component 
with a POINTER or ALLOCATABLE attribute may not be of a type with a coarray ultimate component (named 'y%x') type(B), pointer :: forward real :: u end type @@ -47,6 +48,7 @@ subroutine C937(var) end type type D + !ERROR: A component with a POINTER or ALLOCATABLE attribute may not be of a type with a coarray ultimate component (named 'x') type(A), pointer :: potential end type diff --git a/flang/test/Semantics/call12.f90 b/flang/test/Semantics/call12.f90 index e25a2608c44117..65da46b067d6cd 100644 --- a/flang/test/Semantics/call12.f90 +++ b/flang/test/Semantics/call12.f90 @@ -15,7 +15,7 @@ module m real, pointer :: p end type type :: hasCoarray - real :: co[*] + real, allocatable :: co[:] end type contains pure function test(ptr, in, hpd) diff --git a/flang/test/Semantics/call14.f90 b/flang/test/Semantics/call14.f90 index b874e6b009125a..ee5086511de3b8 100644 --- a/flang/test/Semantics/call14.f90 +++ b/flang/test/Semantics/call14.f90 @@ -3,7 +3,7 @@ module m type :: hasCoarray - real :: coarray[*] + real, allocatable :: coarray[:] end type contains !ERROR: VALUE attribute may apply only to a dummy data object diff --git a/flang/test/Semantics/misc-declarations.f90 b/flang/test/Semantics/misc-declarations.f90 index 7680eed793bce1..f627836b3732c8 100644 --- a/flang/test/Semantics/misc-declarations.f90 +++ b/flang/test/Semantics/misc-declarations.f90 @@ -4,12 +4,12 @@ ! - 8.5.19 constraints on the VOLATILE attribute module m - !ERROR: ALLOCATABLE coarray must have a deferred coshape + !ERROR: 'mustbedeferred' is an ALLOCATABLE coarray and must have a deferred coshape real, allocatable :: mustBeDeferred[*] ! C827 - !ERROR: Non-ALLOCATABLE coarray must have an explicit coshape + !ERROR: Component 'mustbeexplicit' is a non-ALLOCATABLE coarray and must have an explicit coshape real :: mustBeExplicit[:] ! 
C828 type :: hasCoarray - real :: coarray[*] + real, allocatable :: coarray[:] end type real :: coarray[*] type(hasCoarray) :: coarrayComponent diff --git a/flang/test/Semantics/modfile24.f90 b/flang/test/Semantics/modfile24.f90 index ec446f9e8d3c39..45f6c0545627fe 100644 --- a/flang/test/Semantics/modfile24.f90 +++ b/flang/test/Semantics/modfile24.f90 @@ -36,8 +36,8 @@ module m2 ! coarray-spec in components and with non-constants bounds module m3 type t - real :: c[1:10,1:*] - complex, codimension[5,*] :: d + real, allocatable :: c[:,:] + complex, allocatable, codimension[:,:] :: d end type real, allocatable :: e[:,:,:] contains @@ -50,8 +50,8 @@ subroutine s(a, b, n) !Expect: m3.mod !module m3 ! type::t -! real(4)::c[1_8:10_8,1_8:*] -! complex(4)::d[1_8:5_8,1_8:*] +! real(4),allocatable::c[:,:] +! complex(4),allocatable::d[:,:] ! end type ! real(4),allocatable::e[:,:,:] !contains diff --git a/flang/test/Semantics/resolve33.f90 b/flang/test/Semantics/resolve33.f90 index 3fa6bec15f2c19..7df5ba935ab0c3 100644 --- a/flang/test/Semantics/resolve33.f90 +++ b/flang/test/Semantics/resolve33.f90 @@ -2,6 +2,12 @@ ! Derived type parameters ! C731 The same type-param-name shall not appear more than once in a given ! derived-type-stmt. +! C741 A type-param-name in a type-param-def-stmt in a derived-type-def shall +! be one of the type-paramnames in the derived-type-stmt of that +! derived-type-def. +! C742 Each type-param-name in the derived-type-stmt in a derived-type-def +! shall appear exactly once as a type-param-name in a type-param-def-stmt +! in that derived-type-def. module m !ERROR: Duplicate type parameter name: 'a' diff --git a/flang/test/Semantics/resolve44.f90 b/flang/test/Semantics/resolve44.f90 index 2d8b7017875372..41ab06ffb6c6ae 100644 --- a/flang/test/Semantics/resolve44.f90 +++ b/flang/test/Semantics/resolve44.f90 @@ -1,5 +1,8 @@ ! RUN: %B/test/Semantics/test_errors.sh %s %flang %t ! Error tests for recursive use of derived types. +! 
C744 If neither the POINTER nor the ALLOCATABLE attribute is specified, the +! declaration-type-spec in the component-def-stmt shall specify an intrinsic +! type or a previously defined derived type. program main type :: recursive1 diff --git a/flang/test/Semantics/resolve88.f90 b/flang/test/Semantics/resolve88.f90 new file mode 100644 index 00000000000000..50135297241c47 --- /dev/null +++ b/flang/test/Semantics/resolve88.f90 @@ -0,0 +1,75 @@ +! RUN: %B/test/Semantics/test_errors.sh %s %flang %t +! C746, C747, and C748 +module m + use ISO_FORTRAN_ENV + use ISO_C_BINDING + + ! C746 If a coarray-spec appears, it shall be a deferred-coshape-spec-list and + ! the component shall have the ALLOCATABLE attribute. + + type testCoArrayType + real, allocatable, codimension[:] :: allocatableField + !ERROR: Component 'deferredfield' is a coarray and must have the ALLOCATABLE attribute + real, codimension[:] :: deferredField + !ERROR: 'pointerfield' may not have the POINTER attribute because it is a coarray + !ERROR: Component 'pointerfield' is a coarray and must have the ALLOCATABLE attribute + real, pointer, codimension[:] :: pointerField + !ERROR: Component 'realfield' is a coarray and must have the ALLOCATABLE attribute and have a deferred coshape + real, codimension[*] :: realField + !ERROR: 'realfield2' is an ALLOCATABLE coarray and must have a deferred coshape + real, allocatable, codimension[*] :: realField2 + end type testCoArrayType + + ! C747 If a coarray-spec appears, the component shall not be of type C_PTR or + ! C_FUNPTR from the intrinsic module ISO_C_BINDING (18.2), or of type + ! TEAM_TYPE from the intrinsic module ISO_FORTRAN_ENV (16.10.2). 
+ + type goodCoarrayType + real, allocatable, codimension[:] :: field + end type goodCoarrayType + + type goodTeam_typeCoarrayType + type(team_type), allocatable :: field + end type goodTeam_typeCoarrayType + + type goodC_ptrCoarrayType + type(c_ptr), allocatable :: field + end type goodC_ptrCoarrayType + + type goodC_funptrCoarrayType + type(c_funptr), allocatable :: field + end type goodC_funptrCoarrayType + + type team_typeCoarrayType + !ERROR: A coarray component may not be of type TEAM_TYPE from ISO_FORTRAN_ENV + type(team_type), allocatable, codimension[:] :: field + end type team_typeCoarrayType + + type c_ptrCoarrayType + !ERROR: A coarray component may not be of type C_PTR or C_FUNPTR from ISO_C_BINDING + type(c_ptr), allocatable, codimension[:] :: field + end type c_ptrCoarrayType + + type c_funptrCoarrayType + !ERROR: A coarray component may not be of type C_PTR or C_FUNPTR from ISO_C_BINDING + type(c_funptr), allocatable, codimension[:] :: field + end type c_funptrCoarrayType + +! C748 A data component whose type has a coarray ultimate component shall be a +! nonpointer nonallocatable scalar and shall not be a coarray. 
+ + type coarrayType + real, allocatable, codimension[:] :: goodCoarrayField + end type coarrayType + + type testType + type(coarrayType) :: goodField + !ERROR: A component with a POINTER or ALLOCATABLE attribute may not be of a type with a coarray ultimate component (named 'goodcoarrayfield') + type(coarrayType), pointer :: pointerField + !ERROR: A component with a POINTER or ALLOCATABLE attribute may not be of a type with a coarray ultimate component (named 'goodcoarrayfield') + type(coarrayType), allocatable :: allocatableField + !ERROR: An array or coarray component may not be of a type with a coarray ultimate component (named 'goodcoarrayfield') + type(coarrayType), dimension(3) :: arrayField + end type testType + +end module m diff --git a/flang/test/Semantics/resolve89.f90 b/flang/test/Semantics/resolve89.f90 new file mode 100644 index 00000000000000..883970f30edf8e --- /dev/null +++ b/flang/test/Semantics/resolve89.f90 @@ -0,0 +1,110 @@ +! RUN: %B/test/Semantics/test_errors.sh %s %flang %t +! C750 Each bound in the explicit-shape-spec shall be a specification +! expression in which there are no references to specification functions or +! the intrinsic functions ALLOCATED, ASSOCIATED, EXTENDS_- TYPE_OF, PRESENT, +! or SAME_TYPE_AS, every specification inquiry reference is a constant +! expression, and the value does not depend on the value of a variable. +impure function impureFunc() + integer :: impureFunc + + impureFunc = 3 +end function impureFunc + +pure function pureFunc() + integer :: pureFunc + + pureFunc = 3 +end function pureFunc + +module m + real, allocatable :: mVar +end module m + +subroutine s(iArg, allocArg, pointerArg, arrayArg, ioArg, optionalArg) + use m + implicit logical(l) + integer, intent(in) :: iArg + real, allocatable, intent(in) :: allocArg + real, pointer, intent(in) :: pointerArg + integer, dimension(:), intent(in) :: arrayArg + integer, intent(inout) :: ioArg + real, optional, intent(in) :: optionalArg + + ! 
These declarations are OK since they're not in a derived type + real :: realVar + real, volatile :: volatileVar + real, dimension(merge(1, 2, allocated(allocArg))) :: realVar1 + real, dimension(merge(1, 2, associated(pointerArg))) :: realVar2 + real, dimension(merge(1, 2, is_contiguous(arrayArg))) :: realVar3 + real, dimension(ioArg) :: realVar4 + real, dimension(merge(1, 2, present(optionalArg))) :: realVar5 + + ! statement functions referenced below + iVolatileStmtFunc() = 3 * volatileVar + iImpureStmtFunc() = 3 * impureFunc() + iPureStmtFunc() = 3 * pureFunc() + + ! This is OK + real, dimension(merge(1, 2, allocated(mVar))) :: rVar + + + integer :: var = 3 + !ERROR: Invalid specification expression: reference to impure function 'ivolatilestmtfunc' + real, dimension(iVolatileStmtFunc()) :: arrayVarWithVolatile + !ERROR: Invalid specification expression: reference to impure function 'iimpurestmtfunc' + real, dimension(iImpureStmtFunc()) :: arrayVarWithImpureFunction + !ERROR: Invalid specification expression: reference to statement function 'ipurestmtfunc' + real, dimension(iPureStmtFunc()) :: arrayVarWithPureFunction + real, dimension(iabs(iArg)) :: arrayVarWithIntrinsic + + type arrayType + !ERROR: Invalid specification expression: reference to variable 'var' not allowed for derived type components + real, dimension(var) :: varField + !ERROR: Invalid specification expression: reference to impure function 'ivolatilestmtfunc' + real, dimension(iVolatileStmtFunc()) :: arrayFieldWithVolatile + !ERROR: Invalid specification expression: reference to impure function 'iimpurestmtfunc' + real, dimension(iImpureStmtFunc()) :: arrayFieldWithImpureFunction + !ERROR: Invalid specification expression: reference to statement function 'ipurestmtfunc' + real, dimension(iPureStmtFunc()) :: arrayFieldWithPureFunction + !ERROR: Invalid specification expression: reference to variable 'iarg' not allowed for derived type components + real, dimension(iabs(iArg)) :: 
arrayFieldWithIntrinsic + !ERROR: Invalid specification expression: reference to intrinsic 'allocated' not allowed for derived type components + real, dimension(merge(1, 2, allocated(allocArg))) :: realField1 + !ERROR: Invalid specification expression: reference to intrinsic 'associated' not allowed for derived type components + real, dimension(merge(1, 2, associated(pointerArg))) :: realField2 + !ERROR: Invalid specification expression: non-constant reference to inquiry intrinsic 'is_contiguous' not allowed for derived type components + real, dimension(merge(1, 2, is_contiguous(arrayArg))) :: realField3 + !ERROR: Invalid specification expression: reference to variable 'ioarg' not allowed for derived type components + real, dimension(ioArg) :: realField4 + !ERROR: Invalid specification expression: reference to intrinsic 'present' not allowed for derived type components + real, dimension(merge(1, 2, present(optionalArg))) :: realField5 + end type arrayType + +end subroutine s + +subroutine s1() + ! C750, check for a constant specification inquiry that's a type parameter + ! inquiry which are defined in 9.4.5 + type derived(kindParam, lenParam) + integer, kind :: kindParam = 3 + integer, len :: lenParam = 3 + end type + + contains + subroutine inner (derivedArg) + type(derived), intent(in), dimension(3) :: derivedArg + integer :: localInt + + type(derived), parameter :: localderived = derived() + + type localDerivedType + ! OK because the specification inquiry is a constant + integer, dimension(localDerived%kindParam) :: goodField + !ERROR: Invalid specification expression: non-constant reference to a type parameter inquiry not allowed for derived type components + integer, dimension(derivedArg%lenParam) :: badField + end type localDerivedType + + ! 
OK because we're not defining a component + integer, dimension(derivedArg%kindParam) :: localVar + end subroutine inner +end subroutine s1 diff --git a/flang/test/lit.cfg.py b/flang/test/lit.cfg.py index 439f9710ef6600..c1aa851097b7ca 100644 --- a/flang/test/lit.cfg.py +++ b/flang/test/lit.cfg.py @@ -57,18 +57,14 @@ config.substitutions.append(('%B', config.flang_obj_root)) # For each occurrence of a flang tool name, replace it with the full path to -# the build directory holding that tool. We explicitly specify the directories -# to search to ensure that we get the tools just built and not some random -# tools that might happen to be in the user's PATH. -tool_dirs = [config.llvm_tools_dir, config.flang_tools_dir] -flang_includes = "-I" + config.flang_intrinsic_modules_dir - -tools = [ToolSubst('%flang', command=FindTool('flang'), unresolved='fatal'), - ToolSubst('%f18', command=FindTool('f18'), unresolved='fatal'), - ToolSubst('%f18_with_includes', command=FindTool('f18'), - extra_args=[flang_includes], unresolved='fatal')] - -llvm_config.add_tool_substitutions(tools, tool_dirs) +# the build directory holding that tool. 
+tools = [ + ToolSubst('%flang', command=FindTool('flang'), unresolved='fatal'), + ToolSubst('%f18', command=FindTool('f18'), unresolved='fatal'), + ToolSubst('%f18_with_includes', command=FindTool('f18'), + extra_args=["-I" + config.flang_intrinsic_modules_dir], unresolved='fatal') +] +llvm_config.add_tool_substitutions(tools, [config.flang_llvm_tools_dir]) # Enable libpgmath testing result = lit_config.params.get("LIBPGMATH") diff --git a/flang/test/lit.site.cfg.py.in b/flang/test/lit.site.cfg.py.in index 92bd926ab5cac5..e8e2945a2cbf0d 100644 --- a/flang/test/lit.site.cfg.py.in +++ b/flang/test/lit.site.cfg.py.in @@ -7,7 +7,7 @@ config.flang_obj_root = "@FLANG_BINARY_DIR@" config.flang_src_dir = "@FLANG_SOURCE_DIR@" config.flang_tools_dir = "@FLANG_TOOLS_DIR@" config.flang_intrinsic_modules_dir = "@FLANG_INTRINSIC_MODULES_DIR@" -config.flang_llvm_tools_dir = "@LLVM_RUNTIME_OUTPUT_INTDIR@" +config.flang_llvm_tools_dir = "@CMAKE_BINARY_DIR@/bin" config.python_executable = "@PYTHON_EXECUTABLE@" # Support substitution of the tools_dir with user parameters. 
This is diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt index c3c43c04edd766..86434b253befd5 100644 --- a/flang/tools/f18/CMakeLists.txt +++ b/flang/tools/f18/CMakeLists.txt @@ -1,7 +1,10 @@ +set(LLVM_LINK_COMPONENTS + Support + ) add_flang_tool(f18 dump.cpp f18.cpp -) + ) target_link_libraries(f18 PRIVATE @@ -10,7 +13,6 @@ target_link_libraries(f18 FortranEvaluate FortranSemantics FortranLower - LLVMSupport ) set(MODULES diff --git a/libc/test/src/math/cosf_test.cpp b/libc/test/src/math/cosf_test.cpp index 94c66cda1b0fea..54bba168cadf4e 100644 --- a/libc/test/src/math/cosf_test.cpp +++ b/libc/test/src/math/cosf_test.cpp @@ -76,7 +76,7 @@ TEST(CosfTest, InFloatRange) { float x = as_float(v); if (isnan(x) || isinf(x)) continue; - EXPECT_TRUE(mpfr::equalsCos(x, __llvm_libc::cosf(x), tolerance)); + ASSERT_MPFR_MATCH(mpfr::OP_Cos, x, __llvm_libc::cosf(x), tolerance); } } @@ -84,12 +84,12 @@ TEST(CosfTest, InFloatRange) { TEST(CosfTest, SmallValues) { float x = as_float(0x17800000); float result = __llvm_libc::cosf(x); - EXPECT_TRUE(mpfr::equalsCos(x, result, tolerance)); + EXPECT_MPFR_MATCH(mpfr::OP_Cos, x, result, tolerance); EXPECT_EQ(FloatBits::One, as_uint32_bits(result)); - x = as_float(0x00400000); + x = as_float(0x00400000); result = __llvm_libc::cosf(x); - EXPECT_TRUE(mpfr::equalsCos(x, result, tolerance)); + EXPECT_MPFR_MATCH(mpfr::OP_Cos, x, result, tolerance); EXPECT_EQ(FloatBits::One, as_uint32_bits(result)); } @@ -98,6 +98,6 @@ TEST(CosfTest, SDCOMP_26094) { for (uint32_t v : sdcomp26094Values) { float x = as_float(v); - EXPECT_TRUE(mpfr::equalsCos(x, __llvm_libc::cosf(x), tolerance)); + ASSERT_MPFR_MATCH(mpfr::OP_Cos, x, __llvm_libc::cosf(x), tolerance); } } diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp index 36e6b4a129a7cf..93b827a2ad374d 100644 --- a/libc/test/src/math/sincosf_test.cpp +++ b/libc/test/src/math/sincosf_test.cpp @@ -87,8 +87,8 @@ 
TEST(SinCosfTest, InFloatRange) { float sin, cos; __llvm_libc::sincosf(x, &sin, &cos); - EXPECT_TRUE(mpfr::equalsCos(x, cos, tolerance)); - EXPECT_TRUE(mpfr::equalsSin(x, sin, tolerance)); + ASSERT_MPFR_MATCH(mpfr::OP_Cos, x, cos, tolerance); + ASSERT_MPFR_MATCH(mpfr::OP_Sin, x, sin, tolerance); } } @@ -98,16 +98,16 @@ TEST(SinCosfTest, SmallValues) { float x = as_float(bits); float result_cos, result_sin; __llvm_libc::sincosf(x, &result_sin, &result_cos); - EXPECT_TRUE(mpfr::equalsCos(x, result_cos, tolerance)); - EXPECT_TRUE(mpfr::equalsSin(x, result_sin, tolerance)); + EXPECT_MPFR_MATCH(mpfr::OP_Cos, x, result_cos, tolerance); + EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, result_sin, tolerance); EXPECT_EQ(FloatBits::One, as_uint32_bits(result_cos)); EXPECT_EQ(bits, as_uint32_bits(result_sin)); bits = 0x00400000; x = as_float(bits); __llvm_libc::sincosf(x, &result_sin, &result_cos); - EXPECT_TRUE(mpfr::equalsCos(x, result_cos, tolerance)); - EXPECT_TRUE(mpfr::equalsSin(x, result_sin, tolerance)); + EXPECT_MPFR_MATCH(mpfr::OP_Cos, x, result_cos, tolerance); + EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, result_sin, tolerance); EXPECT_EQ(FloatBits::One, as_uint32_bits(result_cos)); EXPECT_EQ(bits, as_uint32_bits(result_sin)); } @@ -119,7 +119,7 @@ TEST(SinCosfTest, SDCOMP_26094) { float x = as_float(v); float sin, cos; __llvm_libc::sincosf(x, &sin, &cos); - EXPECT_TRUE(mpfr::equalsCos(x, cos, tolerance)); - EXPECT_TRUE(mpfr::equalsSin(x, sin, tolerance)); + EXPECT_MPFR_MATCH(mpfr::OP_Cos, x, cos, tolerance); + EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, sin, tolerance); } } diff --git a/libc/test/src/math/sinf_test.cpp b/libc/test/src/math/sinf_test.cpp index e4c6e818b57a39..c0ce0755964c99 100644 --- a/libc/test/src/math/sinf_test.cpp +++ b/libc/test/src/math/sinf_test.cpp @@ -76,13 +76,13 @@ TEST(SinfTest, InFloatRange) { float x = as_float(v); if (isnan(x) || isinf(x)) continue; - EXPECT_TRUE(mpfr::equalsSin(x, __llvm_libc::sinf(x), tolerance)); + ASSERT_MPFR_MATCH(mpfr::OP_Sin, x, 
__llvm_libc::sinf(x), tolerance); } } TEST(SinfTest, SpecificBitPatterns) { float x = as_float(0xc70d39a1); - EXPECT_TRUE(mpfr::equalsSin(x, __llvm_libc::sinf(x), tolerance)); + EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, __llvm_libc::sinf(x), tolerance); } // For small values, sin(x) is x. @@ -90,13 +90,13 @@ TEST(SinfTest, SmallValues) { uint32_t bits = 0x17800000; float x = as_float(bits); float result = __llvm_libc::sinf(x); - EXPECT_TRUE(mpfr::equalsSin(x, result, tolerance)); + EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, result, tolerance); EXPECT_EQ(bits, as_uint32_bits(result)); bits = 0x00400000; x = as_float(bits); result = __llvm_libc::sinf(x); - EXPECT_TRUE(mpfr::equalsSin(x, result, tolerance)); + EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, result, tolerance); EXPECT_EQ(bits, as_uint32_bits(result)); } @@ -105,6 +105,6 @@ TEST(SinfTest, SmallValues) { TEST(SinfTest, SDCOMP_26094) { for (uint32_t v : sdcomp26094Values) { float x = as_float(v); - EXPECT_TRUE(mpfr::equalsSin(x, __llvm_libc::sinf(x), tolerance)); + EXPECT_MPFR_MATCH(mpfr::OP_Sin, x, __llvm_libc::sinf(x), tolerance); } } diff --git a/libc/utils/CPP/TypeTraits.h b/libc/utils/CPP/TypeTraits.h index 81e8e68f09d69c..dfc16b00ab745a 100644 --- a/libc/utils/CPP/TypeTraits.h +++ b/libc/utils/CPP/TypeTraits.h @@ -46,6 +46,22 @@ template struct IsPointerType : public TrueValue {}; template struct IsSame : public FalseValue {}; template struct IsSame : public TrueValue {}; +template struct TypeIdentity { typedef T Type; }; + +template struct RemoveCV : public TypeIdentity {}; +template struct RemoveCV : public TypeIdentity {}; +template struct RemoveCV : public TypeIdentity {}; +template +struct RemoveCV : public TypeIdentity {}; + +template using RemoveCVType = typename RemoveCV::Type; + +template struct IsFloatingPointType { + static constexpr bool Value = IsSame>::Value || + IsSame>::Value || + IsSame>::Value; +}; + } // namespace cpp } // namespace __llvm_libc diff --git a/libc/utils/MPFRWrapper/CMakeLists.txt 
b/libc/utils/MPFRWrapper/CMakeLists.txt index 8de737485681eb..218d5af9fc2818 100644 --- a/libc/utils/MPFRWrapper/CMakeLists.txt +++ b/libc/utils/MPFRWrapper/CMakeLists.txt @@ -12,7 +12,8 @@ if(LIBC_TESTS_CAN_USE_MPFR) MPFRUtils.cpp MPFRUtils.h ) - target_link_libraries(libcMPFRWrapper -lmpfr -lgmp) + add_dependencies(libcMPFRWrapper libc.utils.CPP.standalone_cpp LibcUnitTest LLVMSupport) + target_link_libraries(libcMPFRWrapper -lmpfr -lgmp LibcUnitTest LLVMSupport) else() message(WARNING "Math tests using MPFR will be skipped.") endif() diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index 7bd849934fc779..75ee2adaff5aef 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -8,8 +8,10 @@ #include "MPFRUtils.h" -#include +#include "llvm/ADT/StringRef.h" + #include +#include namespace __llvm_libc { namespace testing { @@ -25,11 +27,38 @@ class MPFRNumber { public: MPFRNumber() { mpfr_init2(value, mpfrPrecision); } - explicit MPFRNumber(float x) { + // We use explicit EnableIf specializations to disallow implicit + // conversions. Implicit conversions can potentially lead to loss of + // precision. 
+ template ::Value, int> = 0> + explicit MPFRNumber(XType x) { mpfr_init2(value, mpfrPrecision); mpfr_set_flt(value, x, MPFR_RNDN); } + template ::Value, int> = 0> + explicit MPFRNumber(XType x) { + mpfr_init2(value, mpfrPrecision); + mpfr_set_d(value, x, MPFR_RNDN); + } + + template ::Value, int> = 0> + MPFRNumber(Operation op, XType rawValue) { + mpfr_init2(value, mpfrPrecision); + MPFRNumber mpfrInput(rawValue); + switch (op) { + case OP_Cos: + mpfr_cos(value, mpfrInput.value, MPFR_RNDN); + break; + case OP_Sin: + mpfr_sin(value, mpfrInput.value, MPFR_RNDN); + break; + } + } + MPFRNumber(const MPFRNumber &other) { mpfr_set(value, other.value, MPFR_RNDN); } @@ -59,38 +88,51 @@ class MPFRNumber { return mpfr_lessequal_p(difference.value, tolerance.value); } + std::string str() const { + // 200 bytes should be more than sufficient to hold a 100-digit number + // plus additional bytes for the decimal point, '-' sign etc. + constexpr size_t printBufSize = 200; + char buffer[printBufSize]; + mpfr_snprintf(buffer, printBufSize, "%100.50Rf", value); + llvm::StringRef ref(buffer); + ref = ref.trim(); + return ref.str(); + } + // These functions are useful for debugging. 
float asFloat() const { return mpfr_get_flt(value, MPFR_RNDN); } double asDouble() const { return mpfr_get_d(value, MPFR_RNDN); } void dump(const char *msg) const { mpfr_printf("%s%.128Rf\n", msg, value); } +}; -public: - static MPFRNumber cos(float x) { - MPFRNumber result; - MPFRNumber mpfrX(x); - mpfr_cos(result.value, mpfrX.value, MPFR_RNDN); - return result; - } +namespace internal { + +template +void MPFRMatcher::explainError(testutils::StreamWrapper &OS) { + MPFRNumber mpfrResult(operation, input); + MPFRNumber mpfrInput(input); + MPFRNumber mpfrMatchValue(matchValue); + OS << "Match value not within tolerance value of MPFR result:\n" + << "Operation input: " << mpfrInput.str() << '\n' + << " Match value: " << mpfrMatchValue.str() << '\n' + << " MPFR result: " << mpfrResult.str() << '\n'; +} - static MPFRNumber sin(float x) { - MPFRNumber result; - MPFRNumber mpfrX(x); - mpfr_sin(result.value, mpfrX.value, MPFR_RNDN); - return result; - } +template void MPFRMatcher::explainError(testutils::StreamWrapper &); +template void MPFRMatcher::explainError(testutils::StreamWrapper &); + +template +bool compare(Operation op, T input, T libcResult, const Tolerance &t) { + MPFRNumber mpfrResult(op, input); + MPFRNumber mpfrInput(input); + MPFRNumber mpfrLibcResult(libcResult); + return mpfrResult.isEqual(mpfrLibcResult, t); }; -bool equalsCos(float input, float libcOutput, const Tolerance &t) { - MPFRNumber mpfrResult = MPFRNumber::cos(input); - MPFRNumber libcResult(libcOutput); - return mpfrResult.isEqual(libcResult, t); -} +template bool compare(Operation, float, float, const Tolerance &); +template bool compare(Operation, double, double, const Tolerance &); -bool equalsSin(float input, float libcOutput, const Tolerance &t) { - MPFRNumber mpfrResult = MPFRNumber::sin(input); - MPFRNumber libcResult(libcOutput); - return mpfrResult.isEqual(libcResult, t); -} +} // namespace internal } // namespace mpfr } // namespace testing diff --git 
a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h index 9f56ccc61fe636..31afd39b289573 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.h +++ b/libc/utils/MPFRWrapper/MPFRUtils.h @@ -9,6 +9,9 @@ #ifndef LLVM_LIBC_UTILS_TESTUTILS_MPFRUTILS_H #define LLVM_LIBC_UTILS_TESTUTILS_MPFRUTILS_H +#include "utils/CPP/TypeTraits.h" +#include "utils/UnitTest/Test.h" + #include namespace __llvm_libc { @@ -36,16 +39,57 @@ struct Tolerance { uint32_t bits; }; -// Return true if |libcOutput| is within the tolerance |t| of the cos(x) -// value as evaluated by MPFR. -bool equalsCos(float x, float libcOutput, const Tolerance &t); +enum Operation { + OP_Cos, + OP_Sin, +}; + +namespace internal { + +template +bool compare(Operation op, T input, T libcOutput, const Tolerance &t); + +template class MPFRMatcher : public testing::Matcher { + static_assert(__llvm_libc::cpp::IsFloatingPointType::Value, + "MPFRMatcher can only be used with floating point values."); + + Operation operation; + T input; + Tolerance tolerance; + T matchValue; + +public: + MPFRMatcher(Operation op, T testInput, Tolerance &t) + : operation(op), input(testInput), tolerance(t) {} -// Return true if |libcOutput| is within the tolerance |t| of the sin(x) -// value as evaluated by MPFR. 
-bool equalsSin(float x, float libcOutput, const Tolerance &t); + bool match(T libcResult) { + matchValue = libcResult; + return internal::compare(operation, input, libcResult, tolerance); + } + + void explainError(testutils::StreamWrapper &OS) override; +}; + +} // namespace internal + +template +internal::MPFRMatcher getMPFRMatcher(Operation op, T input, Tolerance t) { + static_assert( + __llvm_libc::cpp::IsFloatingPointType::Value, + "getMPFRMatcher can only be used to match floating point results."); + return internal::MPFRMatcher(op, input, t); +} } // namespace mpfr } // namespace testing } // namespace __llvm_libc +#define EXPECT_MPFR_MATCH(op, input, matchValue, tolerance) \ + EXPECT_THAT(matchValue, __llvm_libc::testing::mpfr::getMPFRMatcher( \ + op, input, tolerance)) + +#define ASSERT_MPFR_MATCH(op, input, matchValue, tolerance) \ + ASSERT_THAT(matchValue, __llvm_libc::testing::mpfr::getMPFRMatcher( \ + op, input, tolerance)) + #endif // LLVM_LIBC_UTILS_TESTUTILS_MPFRUTILS_H diff --git a/libc/utils/testutils/StreamWrapper.cpp b/libc/utils/testutils/StreamWrapper.cpp index b8a693d767ce79..f6318a99340187 100644 --- a/libc/utils/testutils/StreamWrapper.cpp +++ b/libc/utils/testutils/StreamWrapper.cpp @@ -10,6 +10,7 @@ #include "llvm/Support/raw_ostream.h" #include #include +#include namespace __llvm_libc { namespace testutils { @@ -41,6 +42,7 @@ template StreamWrapper & template StreamWrapper & StreamWrapper::operator<<(unsigned long long t); template StreamWrapper &StreamWrapper::operator<<(bool t); +template StreamWrapper &StreamWrapper::operator<<(std::string t); } // namespace testutils } // namespace __llvm_libc diff --git a/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cerr.pass.cpp b/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cerr.sh.cpp similarity index 85% rename from libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cerr.pass.cpp rename to 
libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cerr.sh.cpp index 5b01f33bf4b0e0..650537dcb20ce8 100644 --- a/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cerr.pass.cpp +++ b/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cerr.sh.cpp @@ -10,6 +10,11 @@ // istream cerr; +// FILE_DEPENDENCIES: %t.exe +// RUN: %{build} +// RUN: %{exec} %t.exe 2> %t.err +// RUN: grep -e 'Hello World!' %t.err + #include #include @@ -17,16 +22,15 @@ int main(int, char**) { -#if 0 + std::cerr << "Hello World!\n"; -#else + #ifdef _LIBCPP_HAS_NO_STDOUT assert(std::cerr.tie() == NULL); #else assert(std::cerr.tie() == &std::cout); #endif assert(std::cerr.flags() & std::ios_base::unitbuf); -#endif // 0 return 0; } diff --git a/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cin.pass.cpp b/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cin.sh.cpp similarity index 78% rename from libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cin.pass.cpp rename to libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cin.sh.cpp index 0b3672a4585cdd..386dbbd4721d3c 100644 --- a/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cin.pass.cpp +++ b/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cin.sh.cpp @@ -12,6 +12,11 @@ // istream cin; +// FILE_DEPENDENCIES: %t.exe +// RUN: %{build} +// RUN: %{exec} echo "123" | %t.exe > %t.out +// RUN: grep -e 'The number is 123!' 
%t.out + #include #include @@ -19,18 +24,14 @@ int main(int, char**) { -#if 0 - std::cout << "Hello World!\n"; int i; - std::cout << "Enter a number: "; std::cin >> i; - std::cout << "The number is : " << i << '\n'; -#else // 0 + std::cout << "The number is " << i << "!"; + #ifdef _LIBCPP_HAS_NO_STDOUT assert(std::cin.tie() == NULL); #else assert(std::cin.tie() == &std::cout); -#endif #endif return 0; diff --git a/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/clog.pass.cpp b/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/clog.sh.cpp similarity index 79% rename from libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/clog.pass.cpp rename to libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/clog.sh.cpp index 68e37294750167..32e23bf61c34fe 100644 --- a/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/clog.pass.cpp +++ b/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/clog.sh.cpp @@ -10,17 +10,18 @@ // istream clog; +// FILE_DEPENDENCIES: %t.exe +// RUN: %{build} +// RUN: %{exec} %t.exe 2> %t.err +// RUN: grep -e 'Hello World!' 
%t.err + #include #include "test_macros.h" int main(int, char**) { -#if 0 std::clog << "Hello World!\n"; -#else - (void)std::clog; -#endif - return 0; + return 0; } diff --git a/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cout.pass.cpp b/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cout.sh.cpp similarity index 75% rename from libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cout.pass.cpp rename to libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cout.sh.cpp index f1d53b773ac128..f4a066b5c50d8c 100644 --- a/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cout.pass.cpp +++ b/libcxx/test/std/input.output/iostream.objects/narrow.stream.objects/cout.sh.cpp @@ -12,21 +12,18 @@ // istream cout; +// FILE_DEPENDENCIES: %t.exe +// RUN: %{build} +// RUN: %{exec} %t.exe > %t.out +// RUN: grep -e 'Hello World!' %t.out + #include #include "test_macros.h" int main(int, char**) { -#if 0 std::cout << "Hello World!\n"; - int i; - std::cout << "Enter a number: "; - std::cin >> i; - std::cout << "The number is : " << i << '\n'; -#else // 0 - (void)std::cout; -#endif - - return 0; + + return 0; } diff --git a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr.pass.cpp b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr.sh.cpp similarity index 84% rename from libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr.pass.cpp rename to libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr.sh.cpp index 1683c49fbf6d87..30974df3951f61 100644 --- a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr.pass.cpp +++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr.sh.cpp @@ -10,6 +10,11 @@ // istream wcerr; +// FILE_DEPENDENCIES: %t.exe +// RUN: %{build} +// RUN: %{exec} %t.exe 2> %t.err +// RUN: grep -e 'Hello World!' 
%t.err + #include #include @@ -17,16 +22,14 @@ int main(int, char**) { -#if 0 std::wcerr << L"Hello World!\n"; -#else + #ifdef _LIBCPP_HAS_NO_STDOUT assert(std::wcerr.tie() == NULL); #else assert(std::wcerr.tie() == &std::wcout); #endif assert(std::wcerr.flags() & std::ios_base::unitbuf); -#endif // 0 - return 0; + return 0; } diff --git a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin.pass.cpp b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin.sh.cpp similarity index 77% rename from libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin.pass.cpp rename to libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin.sh.cpp index c653b2f60678b6..9d24a37233a8c6 100644 --- a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin.pass.cpp +++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin.sh.cpp @@ -12,6 +12,11 @@ // istream wcin; +// FILE_DEPENDENCIES: %t.exe +// RUN: %{build} +// RUN: %{exec} echo "123" | %t.exe > %t.out +// RUN: grep -e 'The number is 123!' 
%t.out + #include #include @@ -19,19 +24,15 @@ int main(int, char**) { -#if 0 - std::wcout << L"Hello World!\n"; int i; - std::wcout << L"Enter a number: "; std::wcin >> i; - std::wcout << L"The number is : " << i << L'\n'; -#else // 0 + std::wcout << L"The number is " << i << L"!"; + #ifdef _LIBCPP_HAS_NO_STDOUT assert(std::wcin.tie() == NULL); #else assert(std::wcin.tie() == &std::wcout); -#endif #endif - return 0; + return 0; } diff --git a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wclog.pass.cpp b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wclog.sh.cpp similarity index 79% rename from libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wclog.pass.cpp rename to libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wclog.sh.cpp index f396500890d887..d1b126067155cf 100644 --- a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wclog.pass.cpp +++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wclog.sh.cpp @@ -10,17 +10,18 @@ // istream wclog; +// FILE_DEPENDENCIES: %t.exe +// RUN: %{build} +// RUN: %{exec} %t.exe 2> %t.err +// RUN: grep -e 'Hello World!' 
%t.err + #include #include "test_macros.h" int main(int, char**) { -#if 0 std::wclog << L"Hello World!\n"; -#else - (void)std::wclog; -#endif - return 0; + return 0; } diff --git a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout.pass.cpp b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout.sh.cpp similarity index 80% rename from libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout.pass.cpp rename to libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout.sh.cpp index b6bd1ef4ea18e8..0a14f898baa213 100644 --- a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout.pass.cpp +++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout.sh.cpp @@ -12,17 +12,18 @@ // istream wcout; +// FILE_DEPENDENCIES: %t.exe +// RUN: %{build} +// RUN: %{exec} %t.exe > %t.out +// RUN: grep -e 'Hello World!' %t.out + #include #include "test_macros.h" int main(int, char**) { -#if 0 std::wcout << L"Hello World!\n"; -#else - (void)std::wcout; -#endif - return 0; + return 0; } diff --git a/libcxxabi/src/cxa_vector.cpp b/libcxxabi/src/cxa_vector.cpp index f20e978d36ef31..325bbf22d20117 100644 --- a/libcxxabi/src/cxa_vector.cpp +++ b/libcxxabi/src/cxa_vector.cpp @@ -24,9 +24,9 @@ namespace __cxxabiv1 { -#if 0 -#pragma mark --Helper routines and classes -- -#endif +// +// Helper routines and classes +// namespace { inline static size_t __get_element_count ( void *p ) { @@ -111,9 +111,9 @@ namespace { }; } -#if 0 -#pragma mark --Externally visible routines-- -#endif +// +// Externally visible routines +// namespace { _LIBCXXABI_NORETURN diff --git a/lld/ELF/Arch/Hexagon.cpp b/lld/ELF/Arch/Hexagon.cpp index 27b20baceeedee..60cc581f94fe17 100644 --- a/lld/ELF/Arch/Hexagon.cpp +++ b/lld/ELF/Arch/Hexagon.cpp @@ -120,6 +120,8 @@ RelExpr Hexagon::getRelExpr(RelType type, const Symbol &s, case R_HEX_B22_PCREL_X: case R_HEX_B32_PCREL_X: case R_HEX_GD_PLT_B22_PCREL: + case 
R_HEX_GD_PLT_B22_PCREL_X: + case R_HEX_GD_PLT_B32_PCREL_X: return R_PLT_PC; case R_HEX_IE_32_6_X: case R_HEX_IE_16_X: @@ -311,16 +313,18 @@ void Hexagon::relocate(uint8_t *loc, const Relocation &rel, case R_HEX_B15_PCREL_X: or32le(loc, applyMask(0x00df20fe, val & 0x3f)); break; - case R_HEX_GD_PLT_B22_PCREL: case R_HEX_B22_PCREL: + case R_HEX_GD_PLT_B22_PCREL: case R_HEX_PLT_B22_PCREL: checkInt(loc, val, 22, rel); or32le(loc, applyMask(0x1ff3ffe, val >> 2)); break; case R_HEX_B22_PCREL_X: + case R_HEX_GD_PLT_B22_PCREL_X: or32le(loc, applyMask(0x1ff3ffe, val & 0x3f)); break; case R_HEX_B32_PCREL_X: + case R_HEX_GD_PLT_B32_PCREL_X: or32le(loc, applyMask(0x0fff3fff, val >> 6)); break; case R_HEX_GOTREL_HI16: diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index ff068019158726..eb30166fcc4ca1 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -1344,8 +1344,11 @@ static void scanReloc(InputSectionBase &sec, OffsetGetter &getOffset, RelTy *&i, addend &= ~0x8000; // R_HEX_GD_PLT_B22_PCREL (call a@GDPLT) is transformed into // call __tls_get_addr even if the symbol is non-preemptible. 
- if (!(config->emachine == EM_HEXAGON && type == R_HEX_GD_PLT_B22_PCREL)) - expr = fromPlt(expr); + if (!(config->emachine == EM_HEXAGON && + (type == R_HEX_GD_PLT_B22_PCREL || + type == R_HEX_GD_PLT_B22_PCREL_X || + type == R_HEX_GD_PLT_B32_PCREL_X))) + expr = fromPlt(expr); } } diff --git a/lld/test/ELF/hexagon-tls-gd-nonpreemptible.s b/lld/test/ELF/hexagon-tls-gd-nonpreemptible.s index ba0eee999c806a..ff5e6dbaac710b 100644 --- a/lld/test/ELF/hexagon-tls-gd-nonpreemptible.s +++ b/lld/test/ELF/hexagon-tls-gd-nonpreemptible.s @@ -3,6 +3,7 @@ # RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf %s -o %t.o # RUN: ld.lld -shared %t.o -o %t.so # RUN: llvm-readobj -r %t.so | FileCheck --check-prefix=RELOC %s +# RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=REL %s # RUN: llvm-objdump -d --no-show-raw-insn --print-imm-hex %t.so | FileCheck %s ## Prior to D77021 lld would error "relocation R_HEX_GD_PLT_B22_PCREL cannot refer to absolute symbol". @@ -17,17 +18,28 @@ # RELOC-NEXT: R_HEX_JMP_SLOT __tls_get_addr 0x0 # RELOC-NEXT: } +# REL: R_HEX_B32_PCREL_X _GLOBAL_OFFSET_TABLE_ 0x0 +# REL-NEXT: R_HEX_6_PCREL_X _GLOBAL_OFFSET_TABLE_ 0x4 +# REL-NEXT: R_HEX_GD_GOT_32_6_X a 0x0 +# REL-NEXT: R_HEX_GD_GOT_16_X a 0x0 +# REL-NEXT: R_HEX_GD_PLT_B22_PCREL a 0x0 +# REL-NEXT: R_HEX_GD_PLT_B32_PCREL_X a 0x0 +# REL-NEXT: R_HEX_GD_PLT_B22_PCREL_X a 0x4 + # CHECK: { immext(#{{.*}}) # CHECK-NEXT: r2 = add(pc,##{{.*}}) } # CHECK-NEXT: { immext(#{{.*}}) # CHECK-NEXT: r0 = add(r2,##-{{.*}}) } # CHECK-NEXT: { call {{.*}} } +# CHECK-NEXT: { immext({{.*}}) +# CHECK-NEXT: call {{.*}} } # CHECK-NEXT: { r0 = memw(r0+#0x0) } _start: r2 = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL) r0 = add(r2,##a@GDGOT) call a@GDPLT + call ##a@GDPLT r0 = memw(r0+#0) ## a is non-preemptible due to STV_HIDDEN visibility. 
diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index 3f727e83f12beb..2cc3d47406b7f6 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -2816,8 +2816,10 @@ void CommandInterpreter::IOHandlerInputComplete(IOHandler &io_handler, case eReturnStatusFailed: m_result.IncrementNumberOfErrors(); - if (io_handler.GetFlags().Test(eHandleCommandFlagStopOnError)) + if (io_handler.GetFlags().Test(eHandleCommandFlagStopOnError)) { + m_result.SetResult(lldb::eCommandInterpreterResultCommandError); io_handler.SetIsDone(true); + } break; case eReturnStatusQuit: diff --git a/lldb/test/Shell/Commands/command-source.test b/lldb/test/Shell/Commands/command-source.test index d8218850c32c13..fa389f2a12889b 100644 --- a/lldb/test/Shell/Commands/command-source.test +++ b/lldb/test/Shell/Commands/command-source.test @@ -1,8 +1,8 @@ # Check that stop command source on error. -# RUN: %lldb -x -b -o "command source -e 1 %s" 2>&1 | FileCheck %s --check-prefix STOP +# RUN: not %lldb -x -b -o "command source -e 1 %s" 2>&1 | FileCheck %s --check-prefix STOP # RUN: %lldb -x -b -o "command source -e 0 %s" 2>&1 | FileCheck %s --check-prefix CONTINUE -# RUN: %lldb -x -b -o 'settings set interpreter.stop-command-source-on-error true' -o "command source %s" 2>&1 | FileCheck %s --check-prefix STOP +# RUN: not %lldb -x -b -o 'settings set interpreter.stop-command-source-on-error true' -o "command source %s" 2>&1 | FileCheck %s --check-prefix STOP # RUN: %lldb -x -b -o 'settings set interpreter.stop-command-source-on-error false' -o "command source %s" 2>&1 | FileCheck %s --check-prefix CONTINUE bogus diff --git a/lldb/test/Shell/Driver/TestProcessAttach.test b/lldb/test/Shell/Driver/TestProcessAttach.test index 4e24ebb161b6e7..ab75814e21ce09 100644 --- a/lldb/test/Shell/Driver/TestProcessAttach.test +++ b/lldb/test/Shell/Driver/TestProcessAttach.test @@ -1,2 +1,2 @@ -# RUN: %lldb 
-x -b -S %S/Inputs/process_attach_pid.in 2>&1 | FileCheck %s +# RUN: not %lldb -x -b -S %S/Inputs/process_attach_pid.in 2>&1 | FileCheck %s # CHECK: last option requires an argument diff --git a/lldb/test/Shell/Host/TestCustomShell.test b/lldb/test/Shell/Host/TestCustomShell.test index fd97b4c2b06e2e..75114c55449341 100644 --- a/lldb/test/Shell/Host/TestCustomShell.test +++ b/lldb/test/Shell/Host/TestCustomShell.test @@ -8,7 +8,7 @@ # XFAIL: system-openbsd # RUN: %clang_host %S/Inputs/simple.c -g -o %t.out -# RUN: SHELL=bogus %lldb %t.out -b -o 'run' 2>&1 | FileCheck %s --check-prefix ERROR +# RUN: SHELL=bogus not %lldb %t.out -b -o 'run' 2>&1 | FileCheck %s --check-prefix ERROR # RUN: env -i %lldb %t.out -b -o 'run' 2>&1 | FileCheck %s # ERROR: error: shell expansion failed diff --git a/lldb/test/Shell/Quit/TestQuitExitCodeNonInt.test b/lldb/test/Shell/Quit/TestQuitExitCodeNonInt.test index 87c0bd41bb05fb..1747ddd669b609 100644 --- a/lldb/test/Shell/Quit/TestQuitExitCodeNonInt.test +++ b/lldb/test/Shell/Quit/TestQuitExitCodeNonInt.test @@ -1,4 +1,4 @@ # UNSUPPORTED: system-windows -# RUN: %lldb -b -s %s 2>&1 | FileCheck %s +# RUN: not %lldb -b -s %s 2>&1 | FileCheck %s q str // CHECK: Couldn't parse 'str' diff --git a/lldb/test/Shell/Quit/TestQuitExitCodeTooManyArgs.test b/lldb/test/Shell/Quit/TestQuitExitCodeTooManyArgs.test index a67669451e9928..315adf02af4d10 100644 --- a/lldb/test/Shell/Quit/TestQuitExitCodeTooManyArgs.test +++ b/lldb/test/Shell/Quit/TestQuitExitCodeTooManyArgs.test @@ -1,4 +1,4 @@ # UNSUPPORTED: system-windows -# RUN: %lldb -b -s %s 2>&1 | FileCheck %s +# RUN: not %lldb -b -s %s 2>&1 | FileCheck %s q 1 2 // CHECK: Too many arguments for 'quit' diff --git a/lldb/test/Shell/Reproducer/TestDiscard.test b/lldb/test/Shell/Reproducer/TestDiscard.test index 829aabbe2b03bf..aee56f77c06f70 100644 --- a/lldb/test/Shell/Reproducer/TestDiscard.test +++ b/lldb/test/Shell/Reproducer/TestDiscard.test @@ -6,7 +6,7 @@ # RUN: %clang_host %S/Inputs/simple.c -g 
-o %t/reproducer.out # Capture but don't generate the reproducer. -# RUN: %lldb -x -b -s %S/Inputs/Discard.in --capture --capture-path %t.repro %t/reproducer.out +# RUN: not %lldb -x -b -s %S/Inputs/Discard.in --capture --capture-path %t.repro %t/reproducer.out # Make sure the directory doesn't exist. # RUN: mkdir %t.repro diff --git a/lldb/test/Shell/Reproducer/TestDump.test b/lldb/test/Shell/Reproducer/TestDump.test index 8300a97004bbfc..cf2c89c938b7d0 100644 --- a/lldb/test/Shell/Reproducer/TestDump.test +++ b/lldb/test/Shell/Reproducer/TestDump.test @@ -25,9 +25,9 @@ # RUN: %lldb --replay %t.repro | FileCheck %s --check-prefix FILES # RUN: rm %t.repro/gdb-remote.yaml -# RUN: %lldb -b -o 'reproducer dump -p gdb -f %t.repro' 2>&1 | FileCheck %s --check-prefix GDB-ERROR +# RUN: not %lldb -b -o 'reproducer dump -p gdb -f %t.repro' 2>&1 | FileCheck %s --check-prefix GDB-ERROR # GDB-ERROR: error: Unable to create GDB loader. # RUN: rm %t.repro/command-interpreter.yaml -# RUN: %lldb -b -o 'reproducer dump -p commands -f %t.repro' 2>&1 | FileCheck %s --check-prefix COMMANDS-ERROR +# RUN: not %lldb -b -o 'reproducer dump -p commands -f %t.repro' 2>&1 | FileCheck %s --check-prefix COMMANDS-ERROR # COMMANDS-ERROR: error: Unable to create command loader. diff --git a/lldb/test/Shell/Settings/TestSettingsSet.test b/lldb/test/Shell/Settings/TestSettingsSet.test index 0def3faaadbb28..3006a694a16b2d 100644 --- a/lldb/test/Shell/Settings/TestSettingsSet.test +++ b/lldb/test/Shell/Settings/TestSettingsSet.test @@ -1,7 +1,7 @@ # This tests setting setting values. # Check that setting an empty value with -f(orce) clears the value. 
-# RUN: %lldb -b -s %s 2>&1 | FileCheck %s +# RUN: not %lldb -b -s %s 2>&1 | FileCheck %s settings set tab-size 16 settings show tab-size diff --git a/lldb/test/Shell/Settings/TestStopCommandSourceOnError.test b/lldb/test/Shell/Settings/TestStopCommandSourceOnError.test index a53dc2cd6868dd..d734a0940a2d72 100644 --- a/lldb/test/Shell/Settings/TestStopCommandSourceOnError.test +++ b/lldb/test/Shell/Settings/TestStopCommandSourceOnError.test @@ -12,13 +12,13 @@ # RUN: %lldb -b -o 'settings set interpreter.stop-command-source-on-error false' -s %S/Inputs/StopCommandSource.in | FileCheck %s --check-prefix CONTINUE # FIXME: Should continue -# RUN: %lldb -b -s %S/Inputs/DontStopCommandSource.in -o 'bogus' -o 'print 111100000 + 11111' | FileCheck %s --check-prefix STOP +# RUN: not %lldb -b -s %S/Inputs/DontStopCommandSource.in -o 'bogus' -o 'print 111100000 + 11111' | FileCheck %s --check-prefix STOP # FIXME: Should continue -# RUN: %lldb -b -o 'settings set interpreter.stop-command-source-on-error false' -o 'bogus' -o 'print 123400000 + 56789' | FileCheck %s --check-prefix STOP +# RUN: not %lldb -b -o 'settings set interpreter.stop-command-source-on-error false' -o 'bogus' -o 'print 123400000 + 56789' | FileCheck %s --check-prefix STOP # FIXME: Should continue -# RUN: %lldb -b -s %S/Inputs/DontStopCommandSource.in | FileCheck %s --check-prefix STOP +# RUN: not %lldb -b -s %S/Inputs/DontStopCommandSource.in | FileCheck %s --check-prefix STOP # FIXME: Should continue -# RUN: %lldb -b -o 'settings set interpreter.stop-command-source-on-error true' -s %S/Inputs/DontStopCommandSource.in | FileCheck %s --check-prefix STOP +# RUN: not %lldb -b -o 'settings set interpreter.stop-command-source-on-error true' -s %S/Inputs/DontStopCommandSource.in | FileCheck %s --check-prefix STOP diff --git a/lldb/test/Shell/SymbolFile/DWARF/debug-types-missing-signature.test b/lldb/test/Shell/SymbolFile/DWARF/debug-types-missing-signature.test index f9c02061fc8637..8f2ef7135afc4e 100644 --- 
a/lldb/test/Shell/SymbolFile/DWARF/debug-types-missing-signature.test +++ b/lldb/test/Shell/SymbolFile/DWARF/debug-types-missing-signature.test @@ -14,10 +14,10 @@ LOOKUPE: no type was found matching 'E' RUN: %lldb %t -b -o "type lookup EC" | FileCheck --check-prefix=LOOKUPEC %s LOOKUPEC: no type was found matching 'EC' -RUN: %lldb %t -b -o "print (E) 1" 2>&1 | FileCheck --check-prefix=PRINTE %s +RUN: not %lldb %t -b -o "print (E) 1" 2>&1 | FileCheck --check-prefix=PRINTE %s PRINTE: use of undeclared identifier 'E' -RUN: %lldb %t -b -o "print (EC) 1" 2>&1 | FileCheck --check-prefix=PRINTEC %s +RUN: not %lldb %t -b -o "print (EC) 1" 2>&1 | FileCheck --check-prefix=PRINTEC %s PRINTEC: use of undeclared identifier 'EC' RUN: %lldb %t -b -o "target variable a e ec" | FileCheck --check-prefix=VARS %s diff --git a/lldb/test/Shell/Unwind/thread-step-out-ret-addr-check.test b/lldb/test/Shell/Unwind/thread-step-out-ret-addr-check.test index e748b4e5c73c31..682b0e5332b1c5 100644 --- a/lldb/test/Shell/Unwind/thread-step-out-ret-addr-check.test +++ b/lldb/test/Shell/Unwind/thread-step-out-ret-addr-check.test @@ -5,7 +5,7 @@ # UNSUPPORTED: system-windows # RUN: %clang_host %p/Inputs/call-asm.c -x assembler-with-cpp %p/Inputs/thread-step-out-ret-addr-check.s -o %t -# RUN: %lldb %t -s %s -b 2>&1 | FileCheck %s +# RUN: not %lldb %t -s %s -b 2>&1 | FileCheck %s breakpoint set -n nonstandard_stub # CHECK: Breakpoint 1: where = {{.*}}`nonstandard_stub diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp index ff7ed2ca0544c5..b38423b285590e 100644 --- a/lldb/tools/driver/Driver.cpp +++ b/lldb/tools/driver/Driver.cpp @@ -619,6 +619,12 @@ int Driver::MainLoop() { results.GetResult() != lldb::eCommandInterpreterResultInferiorCrash) go_interactive = false; + // When running in batch mode and stopped because of an error, exit with a + // non-zero exit status. 
+ if (m_option_data.m_batch && + results.GetResult() == lldb::eCommandInterpreterResultCommandError) + exit(1); + if (m_option_data.m_batch && results.GetResult() == lldb::eCommandInterpreterResultInferiorCrash && !m_option_data.m_after_crash_commands.empty()) { @@ -636,6 +642,13 @@ int Driver::MainLoop() { if (local_results.GetResult() == lldb::eCommandInterpreterResultQuitRequested) go_interactive = false; + + // When running in batch mode and an error occurred while sourcing + // the crash commands, exit with a non-zero exit status. + if (m_option_data.m_batch && + local_results.GetResult() == + lldb::eCommandInterpreterResultCommandError) + exit(1); } } m_debugger.SetAsync(old_async); diff --git a/lldb/unittests/DataFormatter/StringPrinterTests.cpp b/lldb/unittests/DataFormatter/StringPrinterTests.cpp index 4b01f5c1dbe2c1..180b13772af53f 100644 --- a/lldb/unittests/DataFormatter/StringPrinterTests.cpp +++ b/lldb/unittests/DataFormatter/StringPrinterTests.cpp @@ -74,8 +74,8 @@ TEST(StringPrinterTests, CxxASCII) { EXPECT_EQ(fmt("🥑"), QUOTE("🥑")); // Octal (\nnn), hex (\xnn), extended octal (\unnnn or \Unnnnnnnn). - EXPECT_EQ(fmt("\uD55C"), QUOTE("한")); - EXPECT_EQ(fmt("\U00010348"), QUOTE("𐍈")); + EXPECT_EQ(fmt("\uD55C"), QUOTE("\uD55C")); + EXPECT_EQ(fmt("\U00010348"), QUOTE("\U00010348")); // FIXME: These strings are all rejected, but shouldn't be AFAICT. LLDB finds // that these are not valid utf8 sequences, but that's OK, the raw values @@ -111,8 +111,8 @@ TEST(StringPrinterTests, CxxUTF8) { EXPECT_EQ(fmt("🥑"), QUOTE("🥑")); // Octal (\nnn), hex (\xnn), extended octal (\unnnn or \Unnnnnnnn). - EXPECT_EQ(fmt("\uD55C"), QUOTE("한")); - EXPECT_EQ(fmt("\U00010348"), QUOTE("𐍈")); + EXPECT_EQ(fmt("\uD55C"), QUOTE("\uD55C")); + EXPECT_EQ(fmt("\U00010348"), QUOTE("\U00010348")); // FIXME: These strings are all rejected, but shouldn't be AFAICT. 
LLDB finds // that these are not valid utf8 sequences, but that's OK, the raw values @@ -148,8 +148,8 @@ TEST(StringPrinterTests, SwiftUTF8) { EXPECT_EQ(fmt("🥑"), QUOTE("🥑")); // Octal (\nnn), hex (\xnn), extended octal (\unnnn or \Unnnnnnnn). - EXPECT_EQ(fmt("\uD55C"), QUOTE("한")); - EXPECT_EQ(fmt("\U00010348"), QUOTE("𐍈")); + EXPECT_EQ(fmt("\uD55C"), QUOTE("\uD55C")); + EXPECT_EQ(fmt("\U00010348"), QUOTE("\U00010348")); // FIXME: These strings are all rejected, but shouldn't be AFAICT. LLDB finds // that these are not valid utf8 sequences, but that's OK, the raw values diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 91ee584fddc4be..dc63a1a27b7c8a 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -6562,11 +6562,27 @@ On exit from a function: * FLAT_SCRATCH * EXEC * GFX6-8: M0 - * All SGPR and VGPR registers except the clobbered registers of SGPR4-31 and - VGPR0-31. + * All SGPR registers except the clobbered registers of SGPR4-31. + * VGPR40-47 + VGPR56-63 + VGPR72-79 + VGPR88-95 + VGPR104-111 + VGPR120-127 + VGPR136-143 + VGPR152-159 + VGPR168-175 + VGPR184-191 + VGPR200-207 + VGPR216-223 + VGPR232-239 + VGPR248-255 + *Except the argument registers, the VGPR cloberred and the preserved + registers are intermixed at regular intervals in order to + get a better occupancy.* For the AMDGPU backend, an inter-procedural register allocation (IPRA) - optimization may mark some of clobbered SGPR4-31 and VGPR0-31 registers as + optimization may mark some of clobbered SGPR and VGPR registers as preserved if it can be determined that the called function does not change their value. 
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 7a819f0aa5ad53..f3e57567b6bd69 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -620,7 +620,7 @@ class TargetTransformInfo { /// Estimate the overhead of scalarizing an instruction. Insert and Extract /// are set if the demanded result elements need to be inserted and/or /// extracted from vectors. - unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts, + unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract) const; /// Estimate the overhead of scalarizing an instructions unique @@ -1261,7 +1261,8 @@ class TargetTransformInfo::Concept { virtual bool shouldBuildLookupTables() = 0; virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0; virtual bool useColdCCForColdCall(Function &F) = 0; - virtual unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts, + virtual unsigned getScalarizationOverhead(VectorType *Ty, + const APInt &DemandedElts, bool Insert, bool Extract) = 0; virtual unsigned getOperandsScalarizationOverhead(ArrayRef Args, @@ -1609,7 +1610,7 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.useColdCCForColdCall(F); } - unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts, + unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract) override { return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 6171ff9fbf0d67..529cdbcb20dd0b 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -240,7 +240,7 @@ class TargetTransformInfoImplBase { bool 
useColdCCForColdCall(Function &F) { return false; } - unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts, + unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract) { return 0; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index e885b1158d07db..140e39d26da718 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -552,32 +552,30 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { /// Estimate the overhead of scalarizing an instruction. Insert and Extract /// are set if the demanded result elements need to be inserted and/or /// extracted from vectors. - unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts, + unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract) { - auto *VTy = cast(Ty); - assert(DemandedElts.getBitWidth() == VTy->getNumElements() && + assert(DemandedElts.getBitWidth() == Ty->getNumElements() && "Vector size mismatch"); unsigned Cost = 0; - for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { + for (int i = 0, e = Ty->getNumElements(); i < e; ++i) { if (!DemandedElts[i]) continue; if (Insert) Cost += static_cast(this)->getVectorInstrCost( - Instruction::InsertElement, VTy, i); + Instruction::InsertElement, Ty, i); if (Extract) Cost += static_cast(this)->getVectorInstrCost( - Instruction::ExtractElement, VTy, i); + Instruction::ExtractElement, Ty, i); } return Cost; } /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead. 
- unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) { - auto *VTy = cast(Ty); - APInt DemandedElts = APInt::getAllOnesValue(VTy->getNumElements()); + unsigned getScalarizationOverhead(VectorType *Ty, bool Insert, bool Extract) { + APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements()); return static_cast(this)->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); } @@ -591,11 +589,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { SmallPtrSet UniqueOperands; for (const Value *A : Args) { if (!isa(A) && UniqueOperands.insert(A).second) { - Type *VecTy = nullptr; - if (A->getType()->isVectorTy()) { - VecTy = A->getType(); + auto *VecTy = dyn_cast(A->getType()); + if (VecTy) { // If A is a vector operand, VF should be 1 or correspond to A. - assert((VF == 1 || VF == cast(VecTy)->getNumElements()) && + assert((VF == 1 || VF == VecTy->getNumElements()) && "Vector argument does not match VF"); } else @@ -608,17 +605,16 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { return Cost; } - unsigned getScalarizationOverhead(Type *VecTy, ArrayRef Args) { + unsigned getScalarizationOverhead(VectorType *Ty, ArrayRef Args) { unsigned Cost = 0; - auto *VecVTy = cast(VecTy); - Cost += getScalarizationOverhead(VecVTy, true, false); + Cost += getScalarizationOverhead(Ty, true, false); if (!Args.empty()) - Cost += getOperandsScalarizationOverhead(Args, VecVTy->getNumElements()); + Cost += getOperandsScalarizationOverhead(Args, Ty->getNumElements()); else // When no information on arguments is provided, we add the cost // associated with one argument as a heuristic. 
- Cost += getScalarizationOverhead(VecVTy, false, true); + Cost += getScalarizationOverhead(Ty, false, true); return Cost; } @@ -742,13 +738,16 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { break; } + auto *SrcVTy = dyn_cast(Src); + auto *DstVTy = dyn_cast(Dst); + // If the cast is marked as legal (or promote) then assume low cost. if (SrcLT.first == DstLT.first && TLI->isOperationLegalOrPromote(ISD, DstLT.second)) return SrcLT.first; // Handle scalar conversions. - if (!Src->isVectorTy() && !Dst->isVectorTy()) { + if (!SrcVTy && !DstVTy) { // Scalar bitcasts are usually free. if (Opcode == Instruction::BitCast) return 0; @@ -763,9 +762,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { } // Check vector-to-vector casts. - if (Dst->isVectorTy() && Src->isVectorTy()) { - auto *SrcVTy = cast(Src); - auto *DstVTy = cast(Dst); + if (DstVTy && SrcVTy) { // If the cast is between same-sized registers, then the check is simple. if (SrcLT.first == DstLT.first && SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) { @@ -819,19 +816,18 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. - return getScalarizationOverhead(Dst, true, true) + Num * Cost; + return getScalarizationOverhead(DstVTy, true, true) + Num * Cost; } // We already handled vector-to-vector and scalar-to-scalar conversions. // This // is where we handle bitcast between vectors and scalars. We need to assume // that the conversion is scalarized in one way or another. - if (Opcode == Instruction::BitCast) + if (Opcode == Instruction::BitCast) { // Illegal bitcasts are done by storing and loading from a stack slot. - return (Src->isVectorTy() ? getScalarizationOverhead(Src, false, true) - : 0) + - (Dst->isVectorTy() ? getScalarizationOverhead(Dst, true, false) - : 0); + return (SrcVTy ? 
getScalarizationOverhead(SrcVTy, false, true) : 0) + + (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0); + } llvm_unreachable("Unhandled cast"); } @@ -923,7 +919,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { // This is a vector load/store for some illegal type that is scalarized. // We must account for the cost of building or decomposing the vector. - Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store, + Cost += getScalarizationOverhead(cast(Src), + Opcode != Instruction::Store, Opcode == Instruction::Store); } } @@ -1118,7 +1115,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { if (RetVF > 1 || VF > 1) { ScalarizationCost = 0; if (!RetTy->isVoidTy()) - ScalarizationCost += getScalarizationOverhead(RetTy, true, false); + ScalarizationCost += + getScalarizationOverhead(cast(RetTy), true, false); ScalarizationCost += getOperandsScalarizationOverhead(Args, VF); } @@ -1224,21 +1222,19 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { unsigned ScalarizationCost = ScalarizationCostPassed; unsigned ScalarCalls = 1; Type *ScalarRetTy = RetTy; - if (RetTy->isVectorTy()) { + if (auto *RetVTy = dyn_cast(RetTy)) { if (ScalarizationCostPassed == std::numeric_limits::max()) - ScalarizationCost = getScalarizationOverhead(RetTy, true, false); - ScalarCalls = - std::max(ScalarCalls, cast(RetTy)->getNumElements()); + ScalarizationCost = getScalarizationOverhead(RetVTy, true, false); + ScalarCalls = std::max(ScalarCalls, RetVTy->getNumElements()); ScalarRetTy = RetTy->getScalarType(); } SmallVector ScalarTys; for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { Type *Ty = Tys[i]; - if (Ty->isVectorTy()) { + if (auto *VTy = dyn_cast(Ty)) { if (ScalarizationCostPassed == std::numeric_limits::max()) - ScalarizationCost += getScalarizationOverhead(Ty, false, true); - ScalarCalls = - std::max(ScalarCalls, 
cast(Ty)->getNumElements()); + ScalarizationCost += getScalarizationOverhead(VTy, false, true); + ScalarCalls = std::max(ScalarCalls, VTy->getNumElements()); Ty = Ty->getScalarType(); } ScalarTys.push_back(Ty); @@ -1588,12 +1584,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // Else, assume that we need to scalarize this intrinsic. For math builtins // this will emit a costly libcall, adding call overhead and spills. Make it // very expensive. - if (RetTy->isVectorTy()) { + if (auto *RetVTy = dyn_cast(RetTy)) { unsigned ScalarizationCost = ((ScalarizationCostPassed != std::numeric_limits::max()) ? ScalarizationCostPassed - : getScalarizationOverhead(RetTy, true, false)); - unsigned ScalarCalls = cast(RetTy)->getNumElements(); + : getScalarizationOverhead(RetVTy, true, false)); + unsigned ScalarCalls = RetVTy->getNumElements(); SmallVector ScalarTys; for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { Type *Ty = Tys[i]; @@ -1604,14 +1600,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { unsigned ScalarCost = ConcreteTTI->getIntrinsicInstrCost( IID, RetTy->getScalarType(), ScalarTys, FMF, CostKind); for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { - if (Tys[i]->isVectorTy()) { + if (auto *VTy = dyn_cast(Tys[i])) { if (ScalarizationCostPassed == std::numeric_limits::max()) - ScalarizationCost += getScalarizationOverhead(Tys[i], false, true); - ScalarCalls = - std::max(ScalarCalls, cast(Tys[i])->getNumElements()); + ScalarizationCost += getScalarizationOverhead(VTy, false, true); + ScalarCalls = std::max(ScalarCalls, VTy->getNumElements()); } } - return ScalarCalls * ScalarCost + ScalarizationCost; } diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h index 49276fb1a94d44..8b2c27e7b88820 100644 --- a/llvm/include/llvm/CodeGen/MachinePipeliner.h +++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h @@ -43,6 +43,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include 
"llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -60,6 +61,7 @@ extern cl::opt SwpEnableCopyToPhi; class MachinePipeliner : public MachineFunctionPass { public: MachineFunction *MF = nullptr; + MachineOptimizationRemarkEmitter *ORE = nullptr; const MachineLoopInfo *MLI = nullptr; const MachineDominatorTree *MDT = nullptr; const InstrItineraryData *InstrItins; @@ -96,6 +98,7 @@ class MachinePipeliner : public MachineFunctionPass { AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 743160a26966cb..95b17aa702d089 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -370,8 +370,10 @@ bool TargetTransformInfo::useColdCCForColdCall(Function &F) const { return TTIImpl->useColdCCForColdCall(F); } -unsigned TargetTransformInfo::getScalarizationOverhead( - Type *Ty, const APInt &DemandedElts, bool Insert, bool Extract) const { +unsigned +TargetTransformInfo::getScalarizationOverhead(VectorType *Ty, + const APInt &DemandedElts, + bool Insert, bool Extract) const { return TTIImpl->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); } diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 3465aaada873bf..ef4b02ca9e3ef6 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -217,6 +217,7 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) { MF = &mf; MLI = &getAnalysis(); MDT = &getAnalysis(); + ORE = &getAnalysis().getORE(); TII = MF->getSubtarget().getInstrInfo(); RegClassInfo.runOnMachineFunction(*MF); @@ -248,6 +249,12 @@ bool MachinePipeliner::scheduleLoop(MachineLoop 
&L) { setPragmaPipelineOptions(L); if (!canPipelineLoop(L)) { LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n"); + ORE->emit([&]() { + return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop", + L.getStartLoc(), L.getHeader()) + << "Failed to pipeline loop"; + }); + return Changed; } @@ -309,11 +316,24 @@ void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) { /// restricted to loops with a single basic block. Make sure that the /// branch in the loop can be analyzed. bool MachinePipeliner::canPipelineLoop(MachineLoop &L) { - if (L.getNumBlocks() != 1) + if (L.getNumBlocks() != 1) { + ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop", + L.getStartLoc(), L.getHeader()) + << "Not a single basic block: " + << ore::NV("NumBlocks", L.getNumBlocks()); + }); return false; + } - if (disabledByPragma) + if (disabledByPragma) { + ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop", + L.getStartLoc(), L.getHeader()) + << "Disabled by Pragma."; + }); return false; + } // Check if the branch can't be understood because we can't do pipelining // if that's the case. 
@@ -321,25 +341,37 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) { LI.FBB = nullptr; LI.BrCond.clear(); if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) { - LLVM_DEBUG( - dbgs() << "Unable to analyzeBranch, can NOT pipeline current Loop\n"); + LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n"); NumFailBranch++; + ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop", + L.getStartLoc(), L.getHeader()) + << "The branch can't be understood"; + }); return false; } LI.LoopInductionVar = nullptr; LI.LoopCompare = nullptr; if (!TII->analyzeLoopForPipelining(L.getTopBlock())) { - LLVM_DEBUG( - dbgs() << "Unable to analyzeLoop, can NOT pipeline current Loop\n"); + LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n"); NumFailLoop++; + ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop", + L.getStartLoc(), L.getHeader()) + << "The loop structure is not supported"; + }); return false; } if (!L.getLoopPreheader()) { - LLVM_DEBUG( - dbgs() << "Preheader not found, can NOT pipeline current Loop\n"); + LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n"); NumFailPreheader++; + ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop", + L.getStartLoc(), L.getHeader()) + << "No loop preheader found"; + }); return false; } @@ -457,10 +489,13 @@ void SwingSchedulerDAG::schedule() { // Can't schedule a loop without a valid MII. 
if (MII == 0) { - LLVM_DEBUG( - dbgs() - << "0 is not a valid Minimal Initiation Interval, can NOT schedule\n"); + LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n"); NumFailZeroMII++; + Pass.ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis( + DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader()) + << "Invalid Minimal Initiation Interval: 0"; + }); return; } @@ -469,6 +504,14 @@ void SwingSchedulerDAG::schedule() { LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii << ", we don't pipleline large loops\n"); NumFailLargeMaxMII++; + Pass.ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis( + DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader()) + << "Minimal Initiation Interval too large: " + << ore::NV("MII", (int)MII) << " > " + << ore::NV("SwpMaxMii", SwpMaxMii) << "." + << "Refer to -pipeliner-max-mii."; + }); return; } @@ -511,15 +554,24 @@ void SwingSchedulerDAG::schedule() { if (!Scheduled){ LLVM_DEBUG(dbgs() << "No schedule found, return\n"); NumFailNoSchedule++; + Pass.ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis( + DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader()) + << "Unable to find schedule"; + }); return; } unsigned numStages = Schedule.getMaxStageCount(); // No need to generate pipeline if there are no overlapped iterations. if (numStages == 0) { - LLVM_DEBUG( - dbgs() << "No overlapped iterations, no need to generate pipeline\n"); + LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n"); NumFailZeroStage++; + Pass.ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis( + DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader()) + << "No need to pipeline - no overlapped iterations in schedule."; + }); return; } // Check that the maximum stage count is less than user-defined limit. 
@@ -527,9 +579,23 @@ void SwingSchedulerDAG::schedule() { LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages << " : too many stages, abort\n"); NumFailLargeMaxStage++; + Pass.ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis( + DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader()) + << "Too many stages in schedule: " + << ore::NV("numStages", (int)numStages) << " > " + << ore::NV("SwpMaxStages", SwpMaxStages) + << ". Refer to -pipeliner-max-stages."; + }); return; } + Pass.ORE->emit([&]() { + return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(), + Loop.getHeader()) + << "Pipelined succesfully!"; + }); + // Generate the schedule as a ModuloSchedule. DenseMap Cycles, Stages; std::vector OrderedInsts; @@ -1080,7 +1146,7 @@ unsigned SwingSchedulerDAG::calculateResMII() { } } int Resmii = Resources.size(); - LLVM_DEBUG(dbgs() << "Retrun Res MII:" << Resmii << "\n"); + LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n"); // Delete the memory for each of the DFAs that were created earlier. for (ResourceManager *RI : Resources) { ResourceManager *D = RI; @@ -2052,9 +2118,16 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) { LLVM_DEBUG(dbgs() << "Schedule Found? 
" << scheduleFound << " (II=" << II << ")\n"); - if (scheduleFound) + if (scheduleFound) { Schedule.finalizeSchedule(this); - else + Pass.ORE->emit([&]() { + return MachineOptimizationRemarkAnalysis( + DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader()) + << "Schedule found with Initiation Interval: " << ore::NV("II", II) + << ", MaxStageCount: " + << ore::NV("MaxStageCount", Schedule.getMaxStageCount()); + }); + } else Schedule.reset(); return scheduleFound && Schedule.getMaxStageCount() > 0; diff --git a/llvm/lib/Support/Path.cpp b/llvm/lib/Support/Path.cpp index 0a4d5818703b03..775629074f6c4b 100644 --- a/llvm/lib/Support/Path.cpp +++ b/llvm/lib/Support/Path.cpp @@ -540,15 +540,9 @@ void native(SmallVectorImpl &Path, Style style) { Path = PathHome; } } else { - for (auto PI = Path.begin(), PE = Path.end(); PI < PE; ++PI) { - if (*PI == '\\') { - auto PN = PI + 1; - if (PN < PE && *PN == '\\') - ++PI; // increment once, the for loop will move over the escaped slash - else - *PI = '/'; - } - } + for (auto PI = Path.begin(), PE = Path.end(); PI < PE; ++PI) + if (*PI == '\\') + *PI = '/'; } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td index 2c8a5c40421054..7c83b6dcb44b94 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -89,6 +89,24 @@ def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs< (sequence "VGPR%u", 32, 255) >; +def CSR_AMDGPU_VGPRs : CalleeSavedRegs< + // The CSRs & scratch-registers are interleaved at a split boundary of 8. 
+ (add (sequence "VGPR%u", 40, 47), + (sequence "VGPR%u", 56, 63), + (sequence "VGPR%u", 72, 79), + (sequence "VGPR%u", 88, 95), + (sequence "VGPR%u", 104, 111), + (sequence "VGPR%u", 120, 127), + (sequence "VGPR%u", 136, 143), + (sequence "VGPR%u", 152, 159), + (sequence "VGPR%u", 168, 175), + (sequence "VGPR%u", 184, 191), + (sequence "VGPR%u", 200, 207), + (sequence "VGPR%u", 216, 223), + (sequence "VGPR%u", 232, 239), + (sequence "VGPR%u", 248, 255)) +>; + def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs< (sequence "SGPR%u", 32, 105) >; @@ -104,7 +122,7 @@ def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs< >; def CSR_AMDGPU_HighRegs : CalleeSavedRegs< - (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_105) + (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105) >; // Calling convention for leaf functions diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 2e6f021855f05a..a49b1ddbfe9ded 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2525,15 +2525,40 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Opc = UseMI.getOpcode(); if (Opc == AMDGPU::COPY) { - bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); + Register DstReg = UseMI.getOperand(0).getReg(); + bool Is16Bit = getOpSize(UseMI, 0) == 2; + bool isVGPRCopy = RI.isVGPR(*MRI, DstReg); unsigned NewOpc = isVGPRCopy ? 
AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; - if (RI.isAGPR(*MRI, UseMI.getOperand(0).getReg())) { - if (!isInlineConstant(*ImmOp, AMDGPU::OPERAND_REG_INLINE_AC_INT32)) + APInt Imm(32, ImmOp->getImm()); + + if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16) + Imm = Imm.ashr(16); + + if (RI.isAGPR(*MRI, DstReg)) { + if (!isInlineConstant(Imm)) return false; NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32; } + + if (Is16Bit) { + if (isVGPRCopy) + return false; // Do not clobber vgpr_hi16 + + if (DstReg.isVirtual() && + UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) + return false; + + UseMI.getOperand(0).setSubReg(0); + if (DstReg.isPhysical()) { + DstReg = RI.get32BitRegister(DstReg); + UseMI.getOperand(0).setReg(DstReg); + } + assert(UseMI.getOperand(1).getReg().isVirtual()); + } + UseMI.setDesc(get(NewOpc)); - UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); + UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue()); + UseMI.getOperand(1).setTargetFlags(0); UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); return true; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 9fcc5caf7dfdd2..8231a96f5f6b2c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -827,11 +827,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { const MachineOperand &MO = MI.getOperand(OpNo); if (MO.isReg()) { if (unsigned SubReg = MO.getSubReg()) { - assert(RI.getRegSizeInBits(*RI.getSubClassWithSubReg( - MI.getParent()->getParent()->getRegInfo(). 
- getRegClass(MO.getReg()), SubReg)) >= 32 && - "Sub-dword subregs are not supported"); - return RI.getNumChannelsFromSubReg(SubReg) * 4; + return RI.getSubRegIdxSize(SubReg) / 8; } } return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 1dac45a029b3f8..d6e082d64e7afd 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -807,7 +807,7 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, CostKind); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. - return BaseT::getScalarizationOverhead(Ty, Args) + Num * Cost; + return BaseT::getScalarizationOverhead(VTy, Args) + Num * Cost; } return BaseCost; @@ -899,7 +899,7 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy, // The scalarization cost should be a lot higher. We use the number of vector // elements plus the scalarization overhead. 
unsigned ScalarCost = - NumElems * LT.first + BaseT::getScalarizationOverhead(DataTy, {}); + NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, {}); if (Alignment < EltSize / 8) return ScalarCost; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index b8571476d66ae0..99845ae7ca8452 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -115,7 +115,7 @@ unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const { return (8 * ST.getVectorLength()) / ElemWidth; } -unsigned HexagonTTIImpl::getScalarizationOverhead(Type *Ty, +unsigned HexagonTTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract) { return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index 4b0625a67ffd50..b2191910a238f9 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -101,7 +101,7 @@ class HexagonTTIImpl : public BasicTTIImplBase { return true; } - unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts, + unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract); unsigned getOperandsScalarizationOverhead(ArrayRef Args, unsigned VF); diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 4bf03da45397e1..9ec7b07fc3f813 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -464,7 +464,8 @@ int SystemZTTIImpl::getArithmeticInstrCost( return DivInstrCost; } else if (ST->hasVector()) { - unsigned VF = cast(Ty)->getNumElements(); + auto *VTy = cast(Ty); + unsigned VF = 
VTy->getNumElements(); unsigned NumVectors = getNumVectorRegs(Ty); // These vector operations are custom handled, but are still supported @@ -477,7 +478,7 @@ int SystemZTTIImpl::getArithmeticInstrCost( if (DivRemConstPow2) return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1)); if (DivRemConst) - return VF * DivMulSeqCost + getScalarizationOverhead(Ty, Args); + return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args); if ((SignedDivRem || UnsignedDivRem) && VF > 4) // Temporary hack: disable high vectorization factors with integer // division/remainder, which will get scalarized and handled with @@ -500,7 +501,7 @@ int SystemZTTIImpl::getArithmeticInstrCost( // inserting and extracting the values. unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind); - unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args); + unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(VTy, Args); // FIXME: VF 2 for these FP operations are currently just as // expensive as for VF 4. if (VF == 2) @@ -517,7 +518,7 @@ int SystemZTTIImpl::getArithmeticInstrCost( // There is no native support for FRem. if (Opcode == Instruction::FRem) { - unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(Ty, Args); + unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args); // FIXME: VF 2 for float is currently just as expensive as for VF 4. 
if (VF == 2 && ScalarBits == 32) Cost *= 2; @@ -724,8 +725,9 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, } } else if (ST->hasVector()) { - assert (Dst->isVectorTy()); - unsigned VF = cast(Src)->getNumElements(); + auto *SrcVecTy = cast(Src); + auto *DstVecTy = cast(Dst); + unsigned VF = SrcVecTy->getNumElements(); unsigned NumDstVectors = getNumVectorRegs(Dst); unsigned NumSrcVectors = getNumVectorRegs(Src); @@ -781,8 +783,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI)) NeedsExtracts = false; - TotCost += getScalarizationOverhead(Src, false, NeedsExtracts); - TotCost += getScalarizationOverhead(Dst, NeedsInserts, false); + TotCost += getScalarizationOverhead(SrcVecTy, false, NeedsExtracts); + TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts, false); // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4. if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32) @@ -793,7 +795,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, if (Opcode == Instruction::FPTrunc) { if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements. - return VF /*ldxbr/lexbr*/ + getScalarizationOverhead(Dst, true, false); + return VF /*ldxbr/lexbr*/ + + getScalarizationOverhead(DstVecTy, true, false); else // double -> float return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/); } @@ -806,7 +809,7 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, return VF * 2; } // -> fp128. VF * lxdb/lxeb + extraction of elements. 
- return VF + getScalarizationOverhead(Src, false, true); + return VF + getScalarizationOverhead(SrcVecTy, false, true); } } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index f2f34f5f0bd10e..98f6988266057d 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2888,10 +2888,9 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost; } -unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, +unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract) { - auto* VecTy = cast(Ty); unsigned Cost = 0; // For insertions, a ISD::BUILD_VECTOR style vector initialization can be much @@ -2917,7 +2916,7 @@ unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, // 128-bit vector is free. // NOTE: This assumes legalization widens vXf32 vectors. if (MScalarTy == MVT::f32) - for (unsigned i = 0, e = VecTy->getNumElements(); i < e; i += 4) + for (unsigned i = 0, e = Ty->getNumElements(); i < e; i += 4) if (DemandedElts[i]) Cost--; } @@ -2933,7 +2932,7 @@ unsigned X86TTIImpl::getScalarizationOverhead(Type *Ty, // vector elements, which represents the number of unpacks we'll end up // performing. 
unsigned NumElts = LT.second.getVectorNumElements(); - unsigned Pow2Elts = PowerOf2Ceil(VecTy->getNumElements()); + unsigned Pow2Elts = PowerOf2Ceil(Ty->getNumElements()); Cost += (std::min(NumElts, Pow2Elts) - 1) * LT.first; } } @@ -2970,7 +2969,7 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, APInt DemandedElts = APInt::getAllOnesValue(NumElem); int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment, AddressSpace, CostKind); - int SplitCost = getScalarizationOverhead(Src, DemandedElts, + int SplitCost = getScalarizationOverhead(VTy, DemandedElts, Opcode == Instruction::Load, Opcode == Instruction::Store); return NumElem * Cost + SplitCost; diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index eabd0f132363c6..ee9f3a67cd3be2 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -135,7 +135,7 @@ class X86TTIImpl : public BasicTTIImplBase { TTI::TargetCostKind CostKind, const Instruction *I = nullptr); int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); - unsigned getScalarizationOverhead(Type *Ty, const APInt &DemandedElts, + unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract); int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp index c588ac83d2adc5..89eb1159c123c8 100644 --- a/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/llvm/lib/Transforms/IPO/Inliner.cpp @@ -93,6 +93,13 @@ static cl::opt DisableInlinedAllocaMerging("disable-inlined-alloca-merging", cl::init(false), cl::Hidden); +// An integer used to limit the cost of inline deferral. The default negative +// number tells shouldBeDeferred to only take the secondary cost into account. 
+static cl::opt + InlineDeferralScale("inline-deferral-scale", + cl::desc("Scale to limit the cost of inline deferral"), + cl::init(-1), cl::Hidden); + namespace { enum class InlinerFunctionImportStatsOpts { @@ -338,12 +345,8 @@ shouldBeDeferred(Function *Caller, InlineCost IC, int &TotalSecondaryCost, bool ApplyLastCallBonus = Caller->hasLocalLinkage() && !Caller->hasOneUse(); // This bool tracks what happens if we DO inline C into B. bool InliningPreventsSomeOuterInline = false; + unsigned NumCallerUsers = 0; for (User *U : Caller->users()) { - // If the caller will not be removed (either because it does not have a - // local linkage or because the LastCallToStaticBonus has been already - // applied), then we can exit the loop early. - if (!ApplyLastCallBonus && TotalSecondaryCost >= IC.getCost()) - return false; CallBase *CS2 = dyn_cast(U); // If this isn't a call to Caller (it could be some other sort @@ -369,8 +372,13 @@ shouldBeDeferred(Function *Caller, InlineCost IC, int &TotalSecondaryCost, if (IC2.getCostDelta() <= CandidateCost) { InliningPreventsSomeOuterInline = true; TotalSecondaryCost += IC2.getCost(); + NumCallerUsers++; } } + + if (!InliningPreventsSomeOuterInline) + return false; + // If all outer calls to Caller would get inlined, the cost for the last // one is set very low by getInlineCost, in anticipation that Caller will // be removed entirely. We did not account for this above unless there @@ -378,7 +386,14 @@ shouldBeDeferred(Function *Caller, InlineCost IC, int &TotalSecondaryCost, if (ApplyLastCallBonus) TotalSecondaryCost -= InlineConstants::LastCallToStaticBonus; - return InliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost(); + // If InlineDeferralScale is negative, then ignore the cost of primary + // inlining -- IC.getCost() multiplied by the number of callers to Caller. 
+ if (InlineDeferralScale < 0) + return TotalSecondaryCost < IC.getCost(); + + int TotalCost = TotalSecondaryCost + IC.getCost() * NumCallerUsers; + int Allowance = IC.getCost() * InlineDeferralScale; + return TotalCost < Allowance; } static std::basic_ostream &operator<<(std::basic_ostream &R, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 612f32ec034bae..b139f8520df321 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5702,9 +5702,9 @@ int LoopVectorizationCostModel::computePredInstDiscount( // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) { - ScalarCost += - TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF), - APInt::getAllOnesValue(VF), true, false); + ScalarCost += TTI.getScalarizationOverhead( + cast(ToVectorTy(I->getType(), VF)), + APInt::getAllOnesValue(VF), true, false); ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI); } @@ -5720,8 +5720,8 @@ int LoopVectorizationCostModel::computePredInstDiscount( Worklist.push_back(J); else if (needsExtract(J, VF)) ScalarCost += TTI.getScalarizationOverhead( - ToVectorTy(J->getType(), VF), APInt::getAllOnesValue(VF), false, - true); + cast(ToVectorTy(J->getType(), VF)), + APInt::getAllOnesValue(VF), false, true); } // Scale the total scalar cost by block probability. @@ -6016,8 +6016,8 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, Type *RetTy = ToVectorTy(I->getType(), VF); if (!RetTy->isVoidTy() && (!isa(I) || !TTI.supportsEfficientVectorElementLoadStore())) - Cost += TTI.getScalarizationOverhead(RetTy, APInt::getAllOnesValue(VF), - true, false); + Cost += TTI.getScalarizationOverhead( + cast(RetTy), APInt::getAllOnesValue(VF), true, false); // Some targets keep addresses scalar. 
if (isa(I) && !TTI.prefersVectorizedAddressing()) @@ -6222,7 +6222,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (ScalarPredicatedBB) { // Return cost for branches around scalarized and predicated blocks. - Type *Vec_i1Ty = + VectorType *Vec_i1Ty = VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF); return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF), false, true) + diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 235efc450e37aa..008d4002dd835e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -666,6 +666,15 @@ class BoUpSLP { /// may not be necessary. bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const; + /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values + /// can be load combined in the backend. Load combining may not be allowed in + /// the IR optimizer, so we do not want to alter the pattern. For example, + /// partially transforming a scalar bswap() pattern into vector code is + /// effectively impossible for the backend to undo. + /// TODO: If load combining is allowed in the IR optimizer, this analysis + /// may not be necessary. + bool isLoadCombineCandidate() const; + OptimizationRemarkEmitter *getORE() { return ORE; } /// This structure holds any data we need about the edges being traversed @@ -3673,8 +3682,8 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { return true; } -static bool isLoadCombineCandidate(Value *Root, unsigned NumElts, - TargetTransformInfo *TTI) { +static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, + TargetTransformInfo *TTI) { // Look past the root to find a source value. Arbitrarily follow the // path through operand 0 of any 'or'. Also, peek through optional // shift-left-by-constant. 
@@ -3683,9 +3692,9 @@ static bool isLoadCombineCandidate(Value *Root, unsigned NumElts, match(ZextLoad, m_Shl(m_Value(), m_Constant()))) ZextLoad = cast(ZextLoad)->getOperand(0); - // Check if the input is an extended load. + // Check if the input is an extended load of the required or/shift expression. Value *LoadPtr; - if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) + if (ZextLoad == Root || !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) return false; // Require that the total load bit width is a legal integer type. @@ -3710,7 +3719,20 @@ bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const { unsigned NumElts = VectorizableTree[0]->Scalars.size(); Value *FirstReduced = VectorizableTree[0]->Scalars[0]; - return isLoadCombineCandidate(FirstReduced, NumElts, TTI); + return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI); +} + +bool BoUpSLP::isLoadCombineCandidate() const { + // Peek through a final sequence of stores and check if all operations are + // likely to be load-combined. 
+ unsigned NumElts = VectorizableTree[0]->Scalars.size(); + for (Value *Scalar : VectorizableTree[0]->Scalars) { + Value *X; + if (!match(Scalar, m_Store(m_Value(X), m_Value())) || + !isLoadCombineCandidateImpl(X, NumElts, TTI)) + return false; + } + return true; } bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { @@ -5758,6 +5780,8 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, } if (R.isTreeTinyAndNotFullyVectorizable()) return false; + if (R.isLoadCombineCandidate()) + return false; R.computeMinimumValueSizes(); @@ -6010,6 +6034,8 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, } if (R.isTreeTinyAndNotFullyVectorizable()) continue; + if (R.isLoadCombineCandidate()) + return false; R.computeMinimumValueSizes(); int Cost = R.getTreeCost() - UserCost; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index 7b6863fb17a5f5..7f6114b1e98521 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -727,9 +727,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GPRIDX-NEXT: s_mov_b32 s18, 0 ; GPRIDX-NEXT: s_mov_b32 s19, 0x40200000 -; GPRIDX-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GPRIDX-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GPRIDX-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill ; GPRIDX-NEXT: s_mov_b32 s17, 0x401c0000 ; GPRIDX-NEXT: s_mov_b32 s16, s18 ; GPRIDX-NEXT: s_mov_b32 s15, 0x40180000 @@ -793,9 +790,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off ; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off -; GPRIDX-NEXT: buffer_load_dword 
v34, off, s[0:3], s32 ; 4-byte Folded Reload -; GPRIDX-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GPRIDX-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GPRIDX-NEXT: s_waitcnt vmcnt(0) ; GPRIDX-NEXT: s_setpc_b64 s[30:31] ; @@ -816,9 +810,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; MOVREL-NEXT: s_mov_b32 s8, s18 ; MOVREL-NEXT: s_mov_b64 s[6:7], 2.0 ; MOVREL-NEXT: s_mov_b64 s[4:5], 1.0 -; MOVREL-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; MOVREL-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; MOVREL-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill ; MOVREL-NEXT: v_mov_b32_e32 v34, s19 ; MOVREL-NEXT: v_mov_b32_e32 v33, s18 ; MOVREL-NEXT: v_mov_b32_e32 v32, s17 @@ -868,10 +859,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) { ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off ; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off -; MOVREL-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload -; MOVREL-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; MOVREL-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; MOVREL-NEXT: s_waitcnt vmcnt(0) ; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll index e123d80fb95693..9321ac0f4e6350 100644 --- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -69,12 +69,12 @@ bb1: } ; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf: -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:36 -; GCN-DAG: v_writelane_b32 v33, s34, +; GCN: buffer_store_dword v41, off, s[0:3], s32 offset:36 +; 
GCN-DAG: v_writelane_b32 v41, s34, ; GCN: s_mov_b32 s34, s32 ; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}} ; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}} -; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:32 +; GCN-DAG: buffer_store_dword v40, off, s[0:3], s34 offset:32 ; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]] @@ -89,10 +89,10 @@ bb1: ; GCN: v_readlane_b32 ; GCN-NOT: v_readlane_b32 s32 -; GCN-DAG: buffer_load_dword v32, off, s[0:3], s34 offset:32 +; GCN-DAG: buffer_load_dword v40, off, s[0:3], s34 offset:32 ; GCN: s_sub_u32 s32, s32, 0xc00{{$}} -; GCN: v_readlane_b32 s34, v33, -; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN: v_readlane_b32 s34, v41, +; GCN-DAG: buffer_load_dword v41, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GCN: s_setpc_b64 define void @void_func_byval_struct_non_leaf(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index a9222d7663f521..c1f639fb6f8254 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -762,17 +762,13 @@ entry: ; GCN-LABEL: {{^}}tail_call_byval_align16: ; GCN-NOT: s32 -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:20 -; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:16 +; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 +; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; GCN: s_getpc_b64 ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}} -; GCN: 
buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 { @@ -784,15 +780,11 @@ entry: ; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64: ; GCN-NOT: s32 -; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; GCN: buffer_load_dword v33, off, s[0:3], s32{{$}} ; GCN: s_getpc_b64 ; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}} ; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 -; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GCN-NOT: s32 ; GCN: s_setpc_b64 define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll index de0086495870cc..79722b090323c2 100644 --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -13,15 +13,15 @@ define void @use_vcc() #1 { } ; GCN-LABEL: {{^}}indirect_use_vcc: -; GCN: v_writelane_b32 v32, s34, 2 -; GCN: v_writelane_b32 v32, s30, 0 -; GCN: v_writelane_b32 v32, s31, 1 +; GCN: v_writelane_b32 v40, s34, 2 +; GCN: v_writelane_b32 v40, s30, 0 +; GCN: v_writelane_b32 v40, s31, 1 ; GCN: s_swappc_b64 -; GCN: v_readlane_b32 s4, v32, 0 -; GCN: v_readlane_b32 s5, v32, 1 -; GCN: v_readlane_b32 s34, v32, 2 +; GCN: v_readlane_b32 s4, v40, 0 +; GCN: v_readlane_b32 s5, v40, 1 +; GCN: v_readlane_b32 s34, v40, 2 ; GCN: ; NumSgprs: 37 -; GCN: ; NumVgprs: 33 +; GCN: ; NumVgprs: 41 define void @indirect_use_vcc() #1 { call void @use_vcc() 
ret void @@ -32,7 +32,7 @@ define void @indirect_use_vcc() #1 { ; CI: ; NumSgprs: 39 ; VI-NOBUG: ; NumSgprs: 41 ; VI-BUG: ; NumSgprs: 96 -; GCN: ; NumVgprs: 33 +; GCN: ; NumVgprs: 41 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(i32 addrspace(1)* %out) #0 { call void @indirect_use_vcc() ret void @@ -50,7 +50,7 @@ define void @use_flat_scratch() #1 { ; GCN-LABEL: {{^}}indirect_use_flat_scratch: ; CI: ; NumSgprs: 39 ; VI: ; NumSgprs: 41 -; GCN: ; NumVgprs: 33 +; GCN: ; NumVgprs: 41 define void @indirect_use_flat_scratch() #1 { call void @use_flat_scratch() ret void @@ -61,7 +61,7 @@ define void @indirect_use_flat_scratch() #1 { ; CI: ; NumSgprs: 39 ; VI-NOBUG: ; NumSgprs: 41 ; VI-BUG: ; NumSgprs: 96 -; GCN: ; NumVgprs: 33 +; GCN: ; NumVgprs: 41 define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(i32 addrspace(1)* %out) #0 { call void @indirect_use_flat_scratch() ret void @@ -76,7 +76,7 @@ define void @use_10_vgpr() #1 { } ; GCN-LABEL: {{^}}indirect_use_10_vgpr: -; GCN: ; NumVgprs: 33 +; GCN: ; NumVgprs: 41 define void @indirect_use_10_vgpr() #0 { call void @use_10_vgpr() ret void @@ -84,23 +84,23 @@ define void @indirect_use_10_vgpr() #0 { ; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr: ; GCN: is_dynamic_callstack = 0 -; GCN: ; NumVgprs: 33 +; GCN: ; NumVgprs: 41 define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 { call void @indirect_use_10_vgpr() ret void } -; GCN-LABEL: {{^}}use_40_vgpr: -; GCN: ; NumVgprs: 40 -define void @use_40_vgpr() #1 { - call void asm sideeffect "", "~{v39}"() #0 +; GCN-LABEL: {{^}}use_50_vgpr: +; GCN: ; NumVgprs: 50 +define void @use_50_vgpr() #1 { + call void asm sideeffect "", "~{v49}"() #0 ret void } -; GCN-LABEL: {{^}}indirect_use_40_vgpr: -; GCN: ; NumVgprs: 40 -define void @indirect_use_40_vgpr() #0 { - call void @use_40_vgpr() +; GCN-LABEL: {{^}}indirect_use_50_vgpr: +; GCN: ; NumVgprs: 50 +define void @indirect_use_50_vgpr() #0 { + call void @use_50_vgpr() ret void } diff --git 
a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll index ee77007ef59edb..05e887345637b3 100644 --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -5,7 +5,6 @@ declare hidden void @external_void_func_void() #0 ; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: -; GCN: s_mov_b32 s33, s7 ; GCN: s_getpc_b64 s[34:35] ; GCN-NEXT: s_add_u32 s34, s34, ; GCN-NEXT: s_addc_u32 s35, s35, @@ -24,22 +23,22 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_ ; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: ; GCN: buffer_store_dword -; GCN: v_writelane_b32 v32, s34, 4 -; GCN: v_writelane_b32 v32, s36, 0 -; GCN: v_writelane_b32 v32, s37, 1 -; GCN: v_writelane_b32 v32, s30, 2 -; GCN: v_writelane_b32 v32, s31, 3 +; GCN: v_writelane_b32 v40, s34, 4 +; GCN: v_writelane_b32 v40, s36, 0 +; GCN: v_writelane_b32 v40, s37, 1 +; GCN: v_writelane_b32 v40, s30, 2 +; GCN: v_writelane_b32 v40, s31, 3 ; GCN: s_swappc_b64 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_swappc_b64 -; GCN-DAG: v_readlane_b32 s4, v32, 2 -; GCN-DAG: v_readlane_b32 s5, v32, 3 -; GCN: v_readlane_b32 s37, v32, 1 -; GCN: v_readlane_b32 s36, v32, 0 +; GCN-DAG: v_readlane_b32 s4, v40, 2 +; GCN-DAG: v_readlane_b32 s5, v40, 3 +; GCN: v_readlane_b32 s37, v40, 1 +; GCN: v_readlane_b32 s36, v40, 0 -; GCN: v_readlane_b32 s34, v32, 4 +; GCN: v_readlane_b32 s34, v40, 4 ; GCN: buffer_load_dword ; GCN: s_setpc_b64 define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { @@ -50,16 +49,16 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa } ; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: -; GCN: buffer_store_dword v32 -; GCN: v_writelane_b32 v32, s34, 4 +; GCN: 
buffer_store_dword v40 +; GCN: v_writelane_b32 v40, s34, 4 ; GCN: s_mov_b32 s34, s32 ; GCN: s_add_u32 s32, s32, 0x400 ; GCN: s_swappc_b64 ; GCN-NEXT: s_swappc_b64 -; GCN: v_readlane_b32 s34, v32, 4 -; GCN: buffer_load_dword v32, +; GCN: v_readlane_b32 s34, v40, 4 +; GCN: buffer_load_dword v40, define void @test_func_call_external_void_funcx2() #0 { call void @external_void_func_void() call void @external_void_func_void() @@ -116,9 +115,9 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace } ; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31: -; GCN: v_mov_b32_e32 v32, v31 +; GCN: v_mov_b32_e32 v40, v31 ; GCN-NEXT: s_swappc_b64 -; GCN-NEXT: v_mov_b32_e32 v31, v32 +; GCN-NEXT: v_mov_b32_e32 v31, v40 define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 { %v31 = call i32 asm sideeffect "; def $0", "={v31}"() call void @external_void_func_void() @@ -129,8 +128,6 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace ; FIXME: What is the expected behavior for reserved registers here? 
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33: -; GCN: s_mov_b32 s33, s9 -; GCN: s_mov_b32 s32, s33 ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4 @@ -150,14 +147,13 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace( ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: -; GCN: s_mov_b32 s33, s9 -; GCN-NOT: s34 +; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}} ; GCN-NOT: s34 ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4 +; GCN: s_mov_b32 s32, s33 ; GCN-NOT: s34 ; GCN: ;;#ASMSTART @@ -180,32 +176,31 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace( ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32: -; GCN: s_mov_b32 s33, s9 +; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}} ; GCN-NOT: v32 ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4 -; GCN-NOT: v32 -; GCN-DAG: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s32, s33 +; GCN-NOT: v40 ; GCN: ;;#ASMSTART -; GCN-NEXT: ; def v32 +; GCN-NEXT: ; def v40 ; GCN-NEXT: ;;#ASMEND ; GCN: s_swappc_b64 s[30:31], s[4:5] -; GCN-NOT: v32 +; GCN-NOT: v40 ; GCN: ;;#ASMSTART -; GCN-NEXT: ; use v32 +; GCN-NEXT: ; use v40 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_endpgm -define amdgpu_kernel void @test_call_void_func_void_preserves_v32(i32 addrspace(1)* %out) #0 { - %v32 = call i32 asm sideeffect "; def $0", "={v32}"() +define amdgpu_kernel void @test_call_void_func_void_preserves_v40(i32 addrspace(1)* %out) #0 { + %v40 = call i32 asm sideeffect "; def $0", "={v40}"() call void @external_void_func_void() - call void asm sideeffect "; use $0", "{v32}"(i32 %v32) + call void asm sideeffect 
"; use $0", "{v40}"(i32 %v40) ret void } @@ -234,8 +229,6 @@ define hidden void @void_func_void_clobber_s34() #2 { } ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33: -; GCN: s_mov_b32 s33, s7 - ; GCN: s_getpc_b64 ; GCN-NEXT: s_add_u32 ; GCN-NEXT: s_addc_u32 @@ -248,7 +241,6 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 { } ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s34: -; GCN: s_mov_b32 s33, s7 ; GCN: s_getpc_b64 ; GCN-NEXT: s_add_u32 ; GCN-NEXT: s_addc_u32 @@ -262,12 +254,12 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 { ; GCN-LABEL: {{^}}callee_saved_sgpr_func: ; GCN-NOT: s40 -; GCN: v_writelane_b32 v32, s40 +; GCN: v_writelane_b32 v40, s40 ; GCN: s_swappc_b64 ; GCN-NOT: s40 ; GCN: ; use s40 ; GCN-NOT: s40 -; GCN: v_readlane_b32 s40, v32 +; GCN: v_readlane_b32 s40, v40 ; GCN-NOT: s40 define void @callee_saved_sgpr_func() #2 { %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 @@ -294,19 +286,19 @@ define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 { ; First call preserved VGPR is used so it can't be used for SGPR spills. 
; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_func: ; GCN-NOT: s40 -; GCN: v_writelane_b32 v33, s40 +; GCN: v_writelane_b32 v41, s40 ; GCN: s_swappc_b64 ; GCN-NOT: s40 ; GCN: ; use s40 ; GCN-NOT: s40 -; GCN: v_readlane_b32 s40, v33 +; GCN: v_readlane_b32 s40, v41 ; GCN-NOT: s40 define void @callee_saved_sgpr_vgpr_func() #2 { %s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0 - %v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0 + %v40 = call i32 asm sideeffect "; def v40", "={v40}"() #0 call void @external_void_func_void() call void asm sideeffect "; use $0", "s"(i32 %s40) #0 - call void asm sideeffect "; use $0", "v"(i32 %v32) #0 + call void asm sideeffect "; use $0", "v"(i32 %v40) #0 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll index 72423ec4189e5e..7391d7bbdcb0f3 100644 --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -61,11 +61,11 @@ define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32) ; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+4 ; GCN-NEXT: s_mov_b32 s32, s33 -; GCN-NEXT: v_mov_b32_e32 v32, 0 +; GCN-NEXT: v_mov_b32_e32 v40, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, s34 ; GCN-NEXT: v_mov_b32_e32 v1, s35 -; GCN-NEXT: global_store_dword v[0:1], v32, off +; GCN-NEXT: global_store_dword v[0:1], v40, off ; GCN-NEXT: s_endpgm call void @func(i32 0) store i32 0, i32 addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll index c42cadbc80c570..0331881f01280a 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -127,8 +127,8 @@ define void @callee_with_stack_and_call() #0 { ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 ; GCN: s_swappc_b64 -; GCN-DAG: v_readlane_b32 s4, v32, 0 -; GCN-DAG: v_readlane_b32 s5, v32, 1 
+; GCN-DAG: v_readlane_b32 s4, v40, 0 +; GCN-DAG: v_readlane_b32 s5, v40, 1 ; GCN: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: v_readlane_b32 s34, [[CSR_VGPR]], [[FP_SPILL_LANE]] @@ -168,6 +168,7 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 { call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0 call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0 call void asm sideeffect "", "~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() #0 + call void asm sideeffect "", "~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() #0 %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 @@ -207,14 +208,14 @@ define void @spill_only_csr_sgpr() { ; GCN-NEXT:s_mov_b32 [[FP_COPY:s[0-9]+]], s34 ; GCN-NEXT: s_mov_b32 s34, s32 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN-DAG: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v41, off, s[0:3], s34 ; 4-byte Folded Spill ; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s34 offset:8 ; GCN: ;;#ASMSTART -; GCN-NEXT: ; clobber v33 +; GCN-NEXT: ; clobber v41 ; GCN-NEXT: ;;#ASMEND -; GCN: buffer_load_dword v33, off, s[0:3], s34 ; 4-byte Folded Reload +; GCN: buffer_load_dword v41, off, s[0:3], s34 ; 4-byte Folded Reload ; GCN: s_add_u32 s32, s32, 0x300 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300 ; GCN-NEXT: s_mov_b32 s34, s4 @@ -223,7 +224,7 @@ define void @spill_only_csr_sgpr() { define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca - call void asm sideeffect "; clobber v33", "~{v33}"() + call void asm sideeffect "; clobber v41", "~{v41}"() ret void } @@ -232,7 +233,7 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { ; GCN: s_waitcnt ; GCN-NEXT: v_writelane_b32 v1, s34, 63 ; GCN-NEXT: s_mov_b32 s34, s32 -; GCN: buffer_store_dword 
v33, off, s[0:3], s34 ; 4-byte Folded Spill +; GCN: buffer_store_dword v41, off, s[0:3], s34 ; 4-byte Folded Spill ; GCN-COUNT-63: v_writelane_b32 v1 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset:8 ; GCN: ;;#ASMSTART @@ -246,7 +247,7 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { define void @last_lane_vgpr_for_fp_csr() #1 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca - call void asm sideeffect "; clobber v33", "~{v33}"() + call void asm sideeffect "; clobber v41", "~{v41}"() call void asm sideeffect "", "~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49} ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59} @@ -264,14 +265,14 @@ define void @last_lane_vgpr_for_fp_csr() #1 { ; GCN: s_waitcnt ; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s34 ; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s34 ; 4-byte Folded Spill ; GCN-COUNT-64: v_writelane_b32 v1, ; GCN: buffer_store_dword ; GCN: ;;#ASMSTART ; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1 -; GCN: buffer_load_dword v33, off, s[0:3], s34 ; 4-byte Folded Reload +; GCN: buffer_load_dword v41, off, s[0:3], s34 ; 4-byte Folded Reload ; GCN: s_add_u32 s32, s32, 0x300 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300 ; GCN-NEXT: s_mov_b32 s34, [[FP_COPY]] @@ -280,7 +281,7 @@ define void @last_lane_vgpr_for_fp_csr() #1 { define void @no_new_vgpr_for_fp_csr() #1 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca - call void asm sideeffect "; clobber v33", "~{v33}"() + call void asm sideeffect "; clobber v41", "~{v41}"() call void asm sideeffect "", "~{s39},~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49} ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59} @@ -347,20 +348,20 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 { ; GCN-NEXT: s_or_saveexec_b64 
[[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 +; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s34, 2 +; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-DAG: v_writelane_b32 v32, s31, 1 +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 ; GCN-DAG: buffer_store_dword ; GCN: s_add_u32 s32, s32, 0x300{{$}} ; GCN: ;;#ASMSTART -; GCN: v_readlane_b32 s4, v32, 0 -; GCN-NEXT: v_readlane_b32 s5, v32, 1 +; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0 +; GCN-NEXT: v_readlane_b32 s5, [[CSR_VGPR]], 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300{{$}} -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s34, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] @@ -377,11 +378,11 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29} ,~{s30},~{s31}"() #0 - call void asm sideeffect "; clobber nonpreserved VGPRs", + call void asm sideeffect "; clobber nonpreserved initial VGPRs", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} - ,~{v30},~{v31}"() #1 + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() #1 ret void } @@ -394,19 +395,19 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, 
[[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 +; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s34, 2 +; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-DAG: v_writelane_b32 v32, s31, 1 +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 ; GCN-DAG: s_add_u32 s32, s32, 0x40300{{$}} ; GCN-DAG: buffer_store_dword ; GCN: ;;#ASMSTART -; GCN: v_readlane_b32 s4, v32, 0 -; GCN-NEXT: v_readlane_b32 s5, v32, 1 +; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0 +; GCN-NEXT: v_readlane_b32 s5, [[CSR_VGPR]], 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x40300{{$}} -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s34, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Reload @@ -429,7 +430,7 @@ define void @scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval ali "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} - ,~{v30},~{v31}"() #1 + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() #1 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll index 497ea354fc098b..80a0b7892d10f9 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -144,7 +144,7 @@ define hidden void @use_workgroup_id_yz() #1 { ; GCN-NOT: s12 ; GCN-NOT: s13 ; GCN-NOT: s14 -; GCN: v_readlane_b32 s4, v32, 0 +; GCN: v_readlane_b32 s4, v40, 0 define hidden void @func_indirect_use_workgroup_id_x() #1 { call void @use_workgroup_id_x() ret void @@ -152,7 +152,7 @@ define 
hidden void @func_indirect_use_workgroup_id_x() #1 { ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_y: ; GCN-NOT: s4 -; GCN: v_readlane_b32 s4, v32, 0 +; GCN: v_readlane_b32 s4, v40, 0 define hidden void @func_indirect_use_workgroup_id_y() #1 { call void @use_workgroup_id_y() ret void @@ -160,7 +160,7 @@ define hidden void @func_indirect_use_workgroup_id_y() #1 { ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_z: ; GCN-NOT: s4 -; GCN: v_readlane_b32 s4, v32, 0 +; GCN: v_readlane_b32 s4, v40, 0 define hidden void @func_indirect_use_workgroup_id_z() #1 { call void @use_workgroup_id_z() ret void diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll index 601ed9698c6185..dd7ed3bbedf4ba 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -307,7 +307,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_yz() #1 { ; Argument is in right place already ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_x: ; GCN-NOT: s4 -; GCN: v_readlane_b32 s4, v32, 0 +; GCN: v_readlane_b32 s4, v40, 0 define hidden void @func_indirect_use_workgroup_id_x() #1 { call void @use_workgroup_id_x() ret void @@ -315,7 +315,7 @@ define hidden void @func_indirect_use_workgroup_id_x() #1 { ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_y: ; GCN-NOT: s4 -; GCN: v_readlane_b32 s4, v32, 0 +; GCN: v_readlane_b32 s4, v40, 0 define hidden void @func_indirect_use_workgroup_id_y() #1 { call void @use_workgroup_id_y() ret void @@ -323,7 +323,7 @@ define hidden void @func_indirect_use_workgroup_id_y() #1 { ; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_z: ; GCN-NOT: s4 -; GCN: v_readlane_b32 s4, v32, 0 +; GCN: v_readlane_b32 s4, v40, 0 define hidden void @func_indirect_use_workgroup_id_z() #1 { call void @use_workgroup_id_z() ret void diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll 
b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll index 421d41294a28d5..c6add9d7c9fd4c 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -396,13 +396,11 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 { } ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x: -; VARABI: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}} ; VARABI: v_and_b32_e32 v32, 0x3ff, v32 ; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32 -; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VARABI-NEXT: s_waitcnt +; VARABI: s_waitcnt ; VARABI-NEXT: s_setpc_b64 ; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31 @@ -457,14 +455,12 @@ define void @too_many_args_use_workitem_id_x( ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x: ; VARABI: enable_vgpr_workitem_id = 0 -; VARABI: s_mov_b32 s33, s7 ; VARABI: s_mov_b32 s32, s33 ; VARABI: buffer_store_dword v0, off, s[0:3], s32{{$}} ; VARABI: s_swappc_b64 ; FIXEDABI: enable_vgpr_workitem_id = 2 -; FIXEDABI: s_mov_b32 s33, s17 ; FIXEDABI-DAG: s_mov_b32 s32, s33 ; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}} ; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 @@ -516,15 +512,15 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { ; Requires loading and storing to stack slot. 
; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x: ; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}} -; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-DAG: buffer_load_dword v32, off, s[0:3], s34{{$}} ; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}} ; GCN: s_swappc_b64 -; GCN: buffer_load_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload ; GCN: s_sub_u32 s32, s32, 0x400{{$}} +; GCN: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN: s_setpc_b64 define void @too_many_args_call_too_many_args_use_workitem_id_x( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, @@ -545,13 +541,11 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x( ; frame[2] = VGPR spill slot ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval: -; VARABI: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; VARABI-NEXT: s_waitcnt ; VARABI-NEXT: v_and_b32_e32 v32, 0x3ff, v32 ; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32 ; VARABI: buffer_load_dword v0, off, s[0:3], s32{{$}} -; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; VARABI: s_setpc_b64 @@ -616,8 +610,7 @@ define void @too_many_args_use_workitem_id_x_byval( ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval: ; VARABI: enable_vgpr_workitem_id = 0 -; VARABI-DAG: s_mov_b32 s33, s7 -; VARABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} ; VARABI: buffer_store_dword [[K]], off, s[0:3], s33 offset:4 ; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4 ; VARABI: s_add_u32 s32, s33, 0x400{{$}} @@ -630,9 +623,8 @@ define void @too_many_args_use_workitem_id_x_byval( ; VARABI: s_swappc_b64 -; 
FIXEDABI: s_mov_b32 s33, s17 -; FIXEDABI-DAG: s_add_u32 s32, s33, 0x400 -; FIXEDABI-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7 +; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7 +; FIXEDABI: s_add_u32 s32, s33, 0x400{{$}} ; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33 offset:4{{$}} ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140 @@ -703,10 +695,7 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { ret void } -; Only one stack load should be emitted for all 3 values. ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz: -; VARABI: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; VARABI: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; VARABI-NOT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}} ; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}} ; VARABI-NOT: buffer_load_dword @@ -720,9 +709,7 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 { ; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]] ; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]] -; VARABI: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; VARABI-NEXT: s_waitcnt +; VARABI: s_waitcnt ; VARABI-NEXT: s_setpc_b64 @@ -789,8 +776,6 @@ define void @too_many_args_use_workitem_id_xyz( ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz: ; GCN: enable_vgpr_workitem_id = 2 -; VARABI-DAG: s_mov_b32 s33, s7 -; FIXEDABI-DAG: s_mov_b32 s33, s17 ; GCN-DAG: s_mov_b32 s32, s33 ; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 @@ -831,7 +816,7 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 { ; GCN: s_waitcnt ; GCN-NEXT: s_setpc_b64 -; GCN: ScratchSize: 8 +; GCN: ScratchSize: 0 define void @too_many_args_use_workitem_id_x_stack_yz( i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7, i32 
%arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15, @@ -885,9 +870,6 @@ define void @too_many_args_use_workitem_id_x_stack_yz( ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz: ; GCN: enable_vgpr_workitem_id = 2 -; VARABI: s_mov_b32 s33, s7 -; FIXEDABI: s_mov_b32 s33, s17 - ; GCN-NOT: v0 ; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-DAG: v_or_b32_e32 v0, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll index e880d25392d5c3..e5e75c38dad127 100644 --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -28,23 +28,23 @@ define float @call_split_type_used_outside_block_v2f32() #0 { ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s34, 2 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v2f32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func_v2f32@rel32@hi+4 -; GCN-NEXT: v_writelane_b32 v32, s31, 1 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s4, v32, 0 -; GCN-NEXT: v_readlane_b32 s5, v32, 1 +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded 
Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -62,23 +62,23 @@ define float @call_split_type_used_outside_block_v3f32() #0 { ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s34, 2 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v3f32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func_v3f32@rel32@hi+4 -; GCN-NEXT: v_writelane_b32 v32, s31, 1 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s4, v32, 0 -; GCN-NEXT: v_readlane_b32 s5, v32, 1 +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -96,23 +96,23 @@ define half @call_split_type_used_outside_block_v4f16() #0 { ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, 
s[4:5] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s34, 2 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v4f16@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func_v4f16@rel32@hi+4 -; GCN-NEXT: v_writelane_b32 v32, s31, 1 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s4, v32, 0 -; GCN-NEXT: v_readlane_b32 s5, v32, 1 +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -130,24 +130,24 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 { ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 -; GCN-NEXT: v_writelane_b32 v32, s30, 0 +; GCN-NEXT: v_writelane_b32 v40, s34, 2 +; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_struct@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func_struct@rel32@hi+4 -; GCN-NEXT: v_writelane_b32 v32, s31, 1 +; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_readlane_b32 s4, v32, 0 -; 
GCN-NEXT: v_readlane_b32 s5, v32, 1 +; GCN-NEXT: v_readlane_b32 s4, v40, 0 +; GCN-NEXT: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: v_mov_b32_e32 v1, v4 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/fold_16bit_imm.mir b/llvm/test/CodeGen/AMDGPU/fold_16bit_imm.mir new file mode 100644 index 00000000000000..458bdcef1a584e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fold_16bit_imm.mir @@ -0,0 +1,257 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs -run-pass peephole-opt -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: fold_simm_16_sub_to_lo +body: | + bb.0: + + ; GCN-LABEL: name: fold_simm_16_sub_to_lo + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: [[COPY:%[0-9]+]]:sgpr_lo16 = COPY killed [[S_MOV_B32_]].lo16 + ; GCN: SI_RETURN_TO_EPILOG [[COPY]] + %0:sreg_32 = S_MOV_B32 2048 + %1:sgpr_lo16 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_simm_16_sub_to_sub +body: | + bb.0: + + ; GCN-LABEL: name: fold_simm_16_sub_to_sub + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: SI_RETURN_TO_EPILOG [[S_MOV_B32_1]] + %0:sreg_32 = S_MOV_B32 2048 + %1.lo16:sreg_32 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG %1 + +... 
+ +--- +name: fold_simm_16_sub_to_phys +body: | + bb.0: + + ; GCN-LABEL: name: fold_simm_16_sub_to_phys + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: $sgpr0 = S_MOV_B32 2048 + ; GCN: SI_RETURN_TO_EPILOG $sgpr0_lo16 + %0:sreg_32 = S_MOV_B32 2048 + $sgpr0_lo16 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG $sgpr0_lo16 + +... + +--- +name: fold_aimm_16_sub_to_sub_2048 +body: | + bb.0: + + ; GCN-LABEL: name: fold_aimm_16_sub_to_sub_2048 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: %1.lo16:agpr_32 = COPY killed [[S_MOV_B32_]].lo16 + ; GCN: SI_RETURN_TO_EPILOG %1 + %0:sreg_32 = S_MOV_B32 2048 + %1.lo16:agpr_32 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_aimm_16_sub_to_sub_0 +body: | + bb.0: + + ; GCN-LABEL: name: fold_aimm_16_sub_to_sub_0 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: [[V_ACCVGPR_WRITE_B32_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec + ; GCN: SI_RETURN_TO_EPILOG [[V_ACCVGPR_WRITE_B32_]] + %0:sreg_32 = S_MOV_B32 0 + %1.lo16:agpr_32 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_aimm_16_sub_to_phys +body: | + bb.0: + + ; GCN-LABEL: name: fold_aimm_16_sub_to_phys + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: $agpr0 = V_ACCVGPR_WRITE_B32 0, implicit $exec + ; GCN: SI_RETURN_TO_EPILOG $agpr0_lo16 + %0:sreg_32 = S_MOV_B32 0 + $agpr0_lo16 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG $agpr0_lo16 + +... + +--- +name: fold_vimm_16_sub_to_lo +body: | + bb.0: + + ; GCN-LABEL: name: fold_vimm_16_sub_to_lo + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: [[COPY:%[0-9]+]]:vgpr_lo16 = COPY killed [[S_MOV_B32_]].lo16 + ; GCN: SI_RETURN_TO_EPILOG [[COPY]] + %0:sreg_32 = S_MOV_B32 2048 + %1:vgpr_lo16 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG %1 + +... 
+ +--- +name: fold_vimm_16_sub_to_sub +body: | + bb.0: + + ; GCN-LABEL: name: fold_vimm_16_sub_to_sub + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: %1.lo16:vgpr_32 = COPY killed [[S_MOV_B32_]].lo16 + ; GCN: SI_RETURN_TO_EPILOG %1 + %0:sreg_32 = S_MOV_B32 2048 + %1.lo16:vgpr_32 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_vimm_16_sub_to_phys +body: | + bb.0: + + ; GCN-LABEL: name: fold_vimm_16_sub_to_phys + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: $vgpr0_lo16 = COPY killed [[S_MOV_B32_]].lo16 + ; GCN: SI_RETURN_TO_EPILOG $vgpr0_lo16 + %0:sreg_32 = S_MOV_B32 2048 + $vgpr0_lo16 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG $vgpr0_lo16 + +... + +--- +name: fold_vimm_16_lo_to_hi +body: | + bb.0: + + ; GCN-LABEL: name: fold_vimm_16_lo_to_hi + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: %1.hi16:vgpr_32 = COPY killed [[S_MOV_B32_]].lo16 + ; GCN: SI_RETURN_TO_EPILOG %1 + %0:sreg_32 = S_MOV_B32 2048 + %1.hi16:vgpr_32 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_vimm_16_hi_to_lo +body: | + bb.0: + + ; GCN-LABEL: name: fold_vimm_16_hi_to_lo + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: %1.lo16:vgpr_32 = COPY killed [[S_MOV_B32_]].hi16 + ; GCN: SI_RETURN_TO_EPILOG %1 + %0:sreg_32 = S_MOV_B32 2048 + %1.lo16:vgpr_32 = COPY killed %0.hi16 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_simm_16_sub_to_sub_lo_to_hi +body: | + bb.0: + + ; GCN-LABEL: name: fold_simm_16_sub_to_sub_lo_to_hi + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: %1.hi16:sreg_32 = COPY killed [[S_MOV_B32_]].lo16 + ; GCN: SI_RETURN_TO_EPILOG %1 + %0:sreg_32 = S_MOV_B32 2048 + %1.hi16:sreg_32 = COPY killed %0.lo16 + SI_RETURN_TO_EPILOG %1 + +... 
+ +--- +name: fold_simm_16_sub_to_sub_hi_to_lo_2048 +body: | + bb.0: + + ; GCN-LABEL: name: fold_simm_16_sub_to_sub_hi_to_lo_2048 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: SI_RETURN_TO_EPILOG [[S_MOV_B32_1]] + %0:sreg_32 = S_MOV_B32 2048 + %1.lo16:sreg_32 = COPY killed %0.hi16 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_simm_16_sub_to_sub_hi_to_lo_shifted_2048 +body: | + bb.0: + + ; GCN-LABEL: name: fold_simm_16_sub_to_sub_hi_to_lo_shifted_2048 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 134217728 + ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: SI_RETURN_TO_EPILOG [[S_MOV_B32_1]] + %0:sreg_32 = S_MOV_B32 134217728 + %1.lo16:sreg_32 = COPY killed %0.hi16 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_aimm_16_sub_to_sub_hi_to_lo_2048 +body: | + bb.0: + + ; GCN-LABEL: name: fold_aimm_16_sub_to_sub_hi_to_lo_2048 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2048 + ; GCN: [[V_ACCVGPR_WRITE_B32_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 0, implicit $exec + ; GCN: SI_RETURN_TO_EPILOG [[V_ACCVGPR_WRITE_B32_]] + %0:sreg_32 = S_MOV_B32 2048 + %1.lo16:agpr_32 = COPY killed %0.hi16 + SI_RETURN_TO_EPILOG %1 + +... + +--- +name: fold_aimm_16_sub_to_sub_hi_to_lo_shifted_1 +body: | + bb.0: + + ; GCN-LABEL: name: fold_aimm_16_sub_to_sub_hi_to_lo_shifted_1 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65536 + ; GCN: [[V_ACCVGPR_WRITE_B32_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32 1, implicit $exec + ; GCN: SI_RETURN_TO_EPILOG [[V_ACCVGPR_WRITE_B32_]] + %0:sreg_32 = S_MOV_B32 65536 + %1.lo16:agpr_32 = COPY killed %0.hi16 + SI_RETURN_TO_EPILOG %1 + +... 
+ +--- +name: fold_aimm_16_sub_to_sub_hi_to_lo_shifted_2048 +body: | + bb.0: + + ; GCN-LABEL: name: fold_aimm_16_sub_to_sub_hi_to_lo_shifted_2048 + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 134217728 + ; GCN: %1.lo16:agpr_32 = COPY killed [[S_MOV_B32_]].hi16 + ; GCN: SI_RETURN_TO_EPILOG %1 + %0:sreg_32 = S_MOV_B32 134217728 + %1.lo16:agpr_32 = COPY killed %0.hi16 + SI_RETURN_TO_EPILOG %1 + +... diff --git a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll index cc3264af104639..a0a78e96b920a6 100644 --- a/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra-regmask.ll @@ -3,7 +3,7 @@ ; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}} define void @csr() #0 { - call void asm sideeffect "", "~{v0},~{v36},~{v37}"() #0 + call void asm sideeffect "", "~{v0},~{v44},~{v45}"() #0 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index 9722972b4a2ea0..b4925a2e046ba6 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -189,44 +189,44 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; 
GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v35, s34, 4 +; GFX9-NEXT: v_writelane_b32 v43, s34, 4 ; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x800 -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s34 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s34 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v35, s36, 0 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s34 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s34 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v43, s36, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+4 -; GFX9-NEXT: v_writelane_b32 v35, s37, 1 +; GFX9-NEXT: v_writelane_b32 v43, s37, 1 ; GFX9-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v32, v1 -; GFX9-NEXT: v_mov_b32_e32 v33, v0 -; GFX9-NEXT: v_writelane_b32 v35, s30, 2 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, v33, v32 -; GFX9-NEXT: v_writelane_b32 v35, s31, 3 -; GFX9-NEXT: v_and_b32_e32 v34, 0xffffff, v32 +; GFX9-NEXT: v_mov_b32_e32 v40, v1 +; GFX9-NEXT: v_mov_b32_e32 v41, v0 +; GFX9-NEXT: v_writelane_b32 v43, s30, 2 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 +; GFX9-NEXT: v_writelane_b32 v43, s31, 3 +; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_mad_u32_u24 v32, v33, v32, v34 -; GFX9-NEXT: v_mov_b32_e32 v0, v32 +; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42 +; GFX9-NEXT: v_mov_b32_e32 v0, v40 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_add_u32_e32 v0, v32, v34 +; GFX9-NEXT: v_add_u32_e32 v0, v40, v42 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_readlane_b32 s4, v35, 2 -; 
GFX9-NEXT: v_readlane_b32 s5, v35, 3 -; GFX9-NEXT: v_readlane_b32 s37, v35, 1 -; GFX9-NEXT: v_readlane_b32 s36, v35, 0 -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s34 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s34 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s4, v43, 2 +; GFX9-NEXT: v_readlane_b32 s5, v43, 3 +; GFX9-NEXT: v_readlane_b32 s37, v43, 1 +; GFX9-NEXT: v_readlane_b32 s36, v43, 0 +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s34 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s34 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_sub_u32 s32, s32, 0x800 -; GFX9-NEXT: v_readlane_b32 s34, v35, 4 +; GFX9-NEXT: v_readlane_b32 s34, v43, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll index fdbe3a25e64e87..562e40bc5c6d5e 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -12,23 +12,23 @@ declare void @external_void_func_i32(i32) #0 ; Spill CSR VGPR used for SGPR spilling ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-DAG: v_writelane_b32 v32, s34, 2 +; GCN-DAG: v_writelane_b32 v40, s34, 2 ; GCN-DAG: s_mov_b32 s34, s32 ; GCN-DAG: s_add_u32 s32, s32, 0x400 -; GCN-DAG: v_writelane_b32 v32, 
s30, 0 -; GCN-DAG: v_writelane_b32 v32, s31, 1 +; GCN-DAG: v_writelane_b32 v40, s30, 0 +; GCN-DAG: v_writelane_b32 v40, s31, 1 ; GCN: s_swappc_b64 -; GCN: v_readlane_b32 s4, v32, 0 -; GCN: v_readlane_b32 s5, v32, 1 +; GCN: v_readlane_b32 s4, v40, 0 +; GCN: v_readlane_b32 s5, v40, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir index bec7969382d0ff..0020e17a0b6fe0 100644 --- a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir +++ b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir @@ -254,7 +254,7 @@ body: | ... # GCN-LABEL: csr{{$}} -# GCN: V_AND_B32_e32 $vgpr4, $vgpr0, +# GCN: V_AND_B32_e32 $vgpr37, $vgpr0, --- name: csr tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index f887a959cbd28c..bb03589ec2fb4c 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -152,9 +152,6 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l ; FIXME: Why load and store same location for stack args? 
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32: -; GCN-DAG: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill - ; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}} ; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4 @@ -163,9 +160,6 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l ; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}} ; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload - ; GCN-NOT: s32 ; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { @@ -176,7 +170,7 @@ entry: ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object: ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:40 +; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:28 ; GCN: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: @@ -203,15 +197,15 @@ entry: ; Have another non-tail in the function ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call: ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 -; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec ; GCN: s_mov_b32 s34, s32 ; GCN-DAG: s_add_u32 s32, s32, 0x400 -; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill -; GCN-DAG: v_writelane_b32 v34, s36, 0 -; GCN-DAG: v_writelane_b32 v34, s37, 1 +; 
GCN-DAG: buffer_store_dword v40, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v41, off, s[0:3], s34 ; 4-byte Folded Spill +; GCN-DAG: v_writelane_b32 v42, s36, 0 +; GCN-DAG: v_writelane_b32 v42, s37, 1 ; GCN-DAG: s_getpc_b64 s[4:5] ; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 @@ -220,11 +214,11 @@ entry: ; GCN: s_swappc_b64 -; GCN-DAG: v_readlane_b32 s36, v34, 0 -; GCN-DAG: v_readlane_b32 s37, v34, 1 +; GCN-DAG: v_readlane_b32 s36, v42, 0 +; GCN-DAG: v_readlane_b32 s37, v42, 1 -; GCN: buffer_load_dword v33, off, s[0:3], s34 ; 4-byte Folded Reload -; GCN: buffer_load_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload +; GCN: buffer_load_dword v41, off, s[0:3], s34 ; 4-byte Folded Reload +; GCN: buffer_load_dword v40, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 @@ -233,7 +227,7 @@ entry: ; GCN: s_sub_u32 s32, s32, 0x400 ; GCN-NEXT: v_readlane_b32 s34, ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { @@ -248,11 +242,11 @@ entry: ; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32: ; GCN-NOT: s33 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset: +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset: ; GCN-NOT: s33 -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset: +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset: ; GCN: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll 
b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll index 69a4d7eac9ea6a..1581482bd020a2 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -2,17 +2,17 @@ ; GCN-LABEL: {{^}}spill_csr_s5_copy: ; GCN: s_or_saveexec_b64 -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 v32, s34, 2 +; GCN: v_writelane_b32 v40, s34, 2 ; GCN: s_swappc_b64 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 ; GCN: buffer_store_dword [[K]], off, s[0:3], s34{{$}} -; GCN: v_readlane_b32 s34, v32, 2 +; GCN: v_readlane_b32 s34, v40, 2 ; GCN: s_or_saveexec_b64 -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN: s_mov_b64 exec ; GCN: s_setpc_b64 define void @spill_csr_s5_copy() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll index be60a34b420891..2fd5a046fd80de 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -1,6 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GCN %s +; FIXME: The MUBUF loads in this test output are incorrect, their SOffset +; should use the frame offset register, not the ABI stack pointer register. We +; rely on the frame index argument of MUBUF stack accesses to survive until PEI +; so we can fix up the SOffset to use the correct frame register in +; eliminateFrameIndex. Some things like LocalStackSlotAllocation can lift the +; frame index up into something (e.g. 
`v_add_nc_u32`) that we cannot fold back +; into the MUBUF instruction, and so we end up emitting an incorrect offset. +; Fixing this may involve adding stack access pseudos so that we don't have to +; speculatively refer to the ABI stack pointer register at all. + ; An assert was hit when frame offset register was used to address FrameIndex. define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <4 x i32> addrspace(1)* %input, <4 x float> addrspace(1)* %output, i32 %i) { ; GCN-LABEL: kernel_background_evaluate: @@ -18,7 +28,7 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, < ; GCN-NEXT: s_mov_b64 s[2:3], s[38:39] ; GCN-NEXT: v_mov_b32_e32 v4, 0x400000 ; GCN-NEXT: s_add_u32 s32, s33, 0xc0000 -; GCN-NEXT: v_add_nc_u32_e64 v32, 4, 0x4000 +; GCN-NEXT: v_add_nc_u32_e64 v40, 4, 0x4000 ; GCN-NEXT: ; implicit-def: $vcc_hi ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 @@ -30,8 +40,8 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, < ; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GCN-NEXT: s_cbranch_execz BB0_2 ; GCN-NEXT: ; %bb.1: ; %if.then4.i -; GCN-NEXT: buffer_load_dword v0, v32, s[36:39], s32 offen -; GCN-NEXT: buffer_load_dword v1, v32, s[36:39], s32 offen offset:4 +; GCN-NEXT: buffer_load_dword v0, v40, s[36:39], s32 offen +; GCN-NEXT: buffer_load_dword v1, v40, s[36:39], s32 offen offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll new file mode 100644 index 00000000000000..6aa5010b3d78f4 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -0,0 +1,170 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GFX10 %s + +declare void @extern_func() + +define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) { +; The vgpr tuple8 operand in image_gather4_c_b_cl instruction need not be +; preserved across the call and should get 8 scratch registers. + +; GFX9-LABEL: non_preserved_vgpr_tuple8: +; GFX9: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9: buffer_store_dword v40, off, s[0:3], s34 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s34 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s34 ; 4-byte Folded Spill + +; GFX9: v_mov_b32_e32 v37, v11 +; GFX9-NEXT: v_mov_b32_e32 v38, v10 +; GFX9-NEXT: v_mov_b32_e32 v49, v9 +; GFX9-NEXT: v_writelane_b32 v44, s30, 0 +; GFX9-NEXT: v_mov_b32_e32 v36, v16 +; GFX9-NEXT: v_mov_b32_e32 v35, v15 +; GFX9-NEXT: v_mov_b32_e32 v34, v14 +; GFX9-NEXT: v_mov_b32_e32 v33, v13 +; GFX9-NEXT: v_mov_b32_e32 v32, v12 + +; GFX9: ;;#ASMSTART +; GFX9-NEXT: ;;#ASMEND + +; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[12:15] dmask:0x1 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] + +; GFX9: buffer_load_dword v43, off, s[0:3], s34 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s34 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s34 offset:12 ; 4-byte Folded Reload +; GFX9: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9: s_setpc_b64 s[4:5] +; +; 
GFX10-LABEL: non_preserved_vgpr_tuple8: +; GFX10: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX10: buffer_store_dword v40, off, s[0:3], s34 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s34 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s34 ; 4-byte Folded Spill + +; GFX10: v_mov_b32_e32 v36, v16 +; GFX10-NEXT: v_mov_b32_e32 v35, v15 +; GFX10-NEXT: v_mov_b32_e32 v34, v14 +; GFX10-NEXT: v_mov_b32_e32 v33, v13 +; GFX10-NEXT: v_mov_b32_e32 v32, v12 + +; GFX10: ;;#ASMSTART +; GFX10-NEXT: ;;#ASMEND + +; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[12:15] dmask:0x1 +; GFX10-NEXT: v_nop +; GFX10-NEXT: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] + +; GFX10: buffer_load_dword v43, off, s[0:3], s34 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s34 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s34 offset:12 ; 4-byte Folded Reload + +; GFX10: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX10: s_setpc_b64 s[4:5] +main_body: + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 + call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0 + call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0 + call void asm sideeffect "", "~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() #0 + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, 
float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) + call void @extern_func() + ret <4 x float> %v +} + +define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) { +; The vgpr tuple8 operand in image_gather4_c_b_cl instruction needs to be preserved +; across the call and should get allocated to 8 CSRs. +; Only the lower 5 sub-registers of the tuple are preserved. +; The upper 3 sub-registers are unused. + +; GFX9-LABEL: call_preserved_vgpr_tuple8: +; GFX9: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9: buffer_store_dword v40, off, s[0:3], s34 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s34 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s34 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s34 ; 4-byte Folded Spill + +; GFX9: v_mov_b32_e32 v44, v16 +; GFX9-NEXT: v_mov_b32_e32 v43, v15 +; GFX9-NEXT: v_mov_b32_e32 v42, v14 +; GFX9-NEXT: v_mov_b32_e32 v41, v13 +; GFX9-NEXT: v_mov_b32_e32 v40, v12 + +; GFX9: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[44:47] dmask:0x1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[44:47] dmask:0x1 + +; GFX9: buffer_load_dword v44, off, s[0:3], s34 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: 
buffer_load_dword v42, off, s[0:3], s34 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s34 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s34 offset:16 ; 4-byte Folded Reload + +; GFX9: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9: s_setpc_b64 s[4:5] +; +; GFX10-LABEL: call_preserved_vgpr_tuple8: +; GFX10: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX10: buffer_store_dword v40, off, s[0:3], s34 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s34 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s34 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s34 ; 4-byte Folded Spill + +; GFX10: s_getpc_b64 s[4:5] +; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 +; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4 +; GFX10-NEXT: v_mov_b32_e32 v40, v16 +; GFX10-NEXT: v_mov_b32_e32 v41, v15 +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[44:47] dmask:0x1 +; GFX10-NEXT: v_mov_b32_e32 v42, v14 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v43, v13 +; GFX10-NEXT: v_mov_b32_e32 v44, v12 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[44:47] dmask:0x1 + +; GFX10: buffer_load_dword v44, off, s[0:3], s34 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s34 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], 
s34 offset:12 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s34 offset:16 ; 4-byte Folded Reload +; GFX10: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX10: s_setpc_b64 s[4:5] +main_body: + %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) + store <4 x float> %v, <4 x float> addrspace(1)* undef + call void @extern_func() + %v1 = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0) + ret <4 x float> %v1 +} + +declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1 + +attributes #0 = { nounwind writeonly } +attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir index a5d5e7c82d70b8..435c36bdedbcde 100644 --- a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir @@ -30,7 +30,7 @@ machineFunctionInfo: body: | bb.0: ; CHECK-LABEL: name: undef_identity_copy - ; CHECK: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = FLAT_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1) + ; CHECK: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = FLAT_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1) ; CHECK: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead $scc ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def 
$sgpr32, implicit $sgpr32, implicit $sgpr95 ; CHECK: $sgpr4 = COPY $sgpr95 @@ -39,9 +39,9 @@ body: | ; CHECK: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @bar + 4, target-flags(amdgpu-rel32-hi) @bar + 4, implicit-def dead $scc ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95 ; CHECK: $sgpr4 = COPY $sgpr95 - ; CHECK: $vgpr0 = COPY renamable $vgpr32 - ; CHECK: $vgpr1 = COPY renamable $vgpr33 - ; CHECK: $vgpr2 = COPY renamable $vgpr34 + ; CHECK: $vgpr0 = COPY renamable $vgpr40 + ; CHECK: $vgpr1 = COPY renamable $vgpr41 + ; CHECK: $vgpr2 = COPY renamable $vgpr42 ; CHECK: $vgpr3 = KILL undef renamable $vgpr3 ; CHECK: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @bar, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0, implicit killed $vgpr1, implicit killed $vgpr2, implicit killed $vgpr3, implicit-def $vgpr0 ; CHECK: ADJCALLSTACKDOWN 0, 4, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index e3149be899c068..ef2cce1202f1e0 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1058,30 +1058,30 @@ declare void @external_void_func_void() #1 ; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]]], -1{{$}} -; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: v_nop ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 +; GCN-NEXT: v_writelane_b32 v40, s34, 2 ; GCN: s_mov_b32 s34, s32 ; GFX1064: s_add_u32 s32, s32, 0x400 ; GFX1032: s_add_u32 s32, s32, 0x200 -; GCN-DAG: v_writelane_b32 v32, s30, 0 -; GCN-DAG: 
v_writelane_b32 v32, s31, 1 +; GCN-DAG: v_writelane_b32 v40, s30, 0 +; GCN-DAG: v_writelane_b32 v40, s31, 1 ; GCN: s_swappc_b64 -; GCN-DAG: v_readlane_b32 s4, v32, 0 -; GCN-DAG: v_readlane_b32 s5, v32, 1 +; GCN-DAG: v_readlane_b32 s4, v40, 0 +; GCN-DAG: v_readlane_b32 s5, v40, 1 ; GFX1064: s_sub_u32 s32, s32, 0x400 ; GFX1032: s_sub_u32 s32, s32, 0x200 -; GCN: v_readlane_b32 s34, v32, 2 +; GCN: v_readlane_b32 s34, v40, 2 ; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]]], -1{{$}} -; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: v_nop ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]] diff --git a/llvm/test/CodeGen/PowerPC/sms-remark.ll b/llvm/test/CodeGen/PowerPC/sms-remark.ll new file mode 100644 index 00000000000000..647b56fa7fcd3f --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/sms-remark.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -verify-machineinstrs -ppc-asm-full-reg-names -mcpu=pwr9 --ppc-enable-pipeliner \ +; RUN: -pass-remarks-analysis=pipeliner -pass-remarks=pipeliner -o /dev/null 2>&1 \ +; RUN: | FileCheck %s + +@x = dso_local local_unnamed_addr global <{ i32, i32, i32, i32, [1020 x i32] }> <{ i32 1, i32 2, i32 3, i32 4, [1020 x i32] zeroinitializer }>, align 4 +@y = dso_local global [1024 x i32] zeroinitializer, align 4 + +define dso_local i32* @foo() local_unnamed_addr { +;CHECK: Schedule found with Initiation Interval +;CHECK: Pipelined succesfully! 
+entry: + %.pre = load i32, i32* getelementptr inbounds ([1024 x i32], [1024 x i32]* @y, i64 0, i64 0), align 4 + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i32* getelementptr inbounds ([1024 x i32], [1024 x i32]* @y, i64 0, i64 0) + +for.body: ; preds = %for.body, %entry + %0 = phi i32 [ %.pre, %entry ], [ %add.2, %for.body ] + %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next.2, %for.body ] + %arrayidx2 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %mul = mul nsw i32 %1, %1 + %add = add nsw i32 %mul, %0 + %arrayidx6 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv + store i32 %add, i32* %arrayidx6, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx2.1 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv.next + %2 = load i32, i32* %arrayidx2.1, align 4 + %mul.1 = mul nsw i32 %2, %2 + %add.1 = add nsw i32 %mul.1, %add + %arrayidx6.1 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv.next + store i32 %add.1, i32* %arrayidx6.1, align 4 + %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx2.2 = getelementptr inbounds [1024 x i32], [1024 x i32]* bitcast (<{ i32, i32, i32, i32, [1020 x i32] }>* @x to [1024 x i32]*), i64 0, i64 %indvars.iv.next.1 + %3 = load i32, i32* %arrayidx2.2, align 4 + %mul.2 = mul nsw i32 %3, %3 + %add.2 = add nsw i32 %mul.2, %add.1 + %arrayidx6.2 = getelementptr inbounds [1024 x i32], [1024 x i32]* @y, i64 0, i64 %indvars.iv.next.1 + store i32 %add.2, i32* %arrayidx6.2, align 4 + %indvars.iv.next.2 = add nuw nsw i64 %indvars.iv, 3 + %exitcond.2 = icmp eq i64 %indvars.iv.next.2, 1024 + br i1 %exitcond.2, label %for.cond.cleanup, label %for.body +} diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll index fb206b84fa936f..319d4775c5ebe2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll @@ -393,34 +393,50 @@ define void @PR39538(i8* %t0, i32* %t1) { ; CHECK-NEXT: [[T63:%.*]] = load i8, i8* [[T62]], align 1 ; CHECK-NEXT: [[T68:%.*]] = load i8, i8* [[T67]], align 1 ; CHECK-NEXT: [[T73:%.*]] = load i8, i8* [[T72]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> undef, i8 [[T3]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[T21]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> [[TMP2]], i8 [[T40]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 [[T59]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i8> undef, i8 [[T7]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i8> [[TMP6]], i8 [[T25]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i8> [[TMP7]], i8 [[T44]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i8> [[TMP8]], i8 [[T63]], i32 3 -; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[TMP9]] to <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i8> undef, i8 [[T12]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i8> [[TMP11]], i8 [[T30]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i8> [[TMP12]], i8 [[T49]], i32 2 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i8> [[TMP13]], i8 [[T68]], i32 3 -; CHECK-NEXT: [[TMP15:%.*]] = zext <4 x i8> [[TMP14]] to <4 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i8> undef, i8 [[T17]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i8> [[TMP16]], i8 [[T35]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i8> [[TMP17]], i8 [[T54]], i32 2 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i8> 
[[TMP18]], i8 [[T73]], i32 3 -; CHECK-NEXT: [[TMP20:%.*]] = zext <4 x i8> [[TMP19]] to <4 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shl nuw <4 x i32> [[TMP5]], -; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw <4 x i32> [[TMP10]], -; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw <4 x i32> [[TMP15]], -; CHECK-NEXT: [[TMP24:%.*]] = or <4 x i32> [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[TMP25:%.*]] = or <4 x i32> [[TMP24]], [[TMP23]] -; CHECK-NEXT: [[TMP26:%.*]] = or <4 x i32> [[TMP25]], [[TMP20]] -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[T1]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP26]], <4 x i32>* [[TMP27]], align 4 +; CHECK-NEXT: [[T4:%.*]] = zext i8 [[T3]] to i32 +; CHECK-NEXT: [[T8:%.*]] = zext i8 [[T7]] to i32 +; CHECK-NEXT: [[T13:%.*]] = zext i8 [[T12]] to i32 +; CHECK-NEXT: [[T18:%.*]] = zext i8 [[T17]] to i32 +; CHECK-NEXT: [[T22:%.*]] = zext i8 [[T21]] to i32 +; CHECK-NEXT: [[T26:%.*]] = zext i8 [[T25]] to i32 +; CHECK-NEXT: [[T31:%.*]] = zext i8 [[T30]] to i32 +; CHECK-NEXT: [[T36:%.*]] = zext i8 [[T35]] to i32 +; CHECK-NEXT: [[T41:%.*]] = zext i8 [[T40]] to i32 +; CHECK-NEXT: [[T45:%.*]] = zext i8 [[T44]] to i32 +; CHECK-NEXT: [[T50:%.*]] = zext i8 [[T49]] to i32 +; CHECK-NEXT: [[T55:%.*]] = zext i8 [[T54]] to i32 +; CHECK-NEXT: [[T60:%.*]] = zext i8 [[T59]] to i32 +; CHECK-NEXT: [[T64:%.*]] = zext i8 [[T63]] to i32 +; CHECK-NEXT: [[T69:%.*]] = zext i8 [[T68]] to i32 +; CHECK-NEXT: [[T74:%.*]] = zext i8 [[T73]] to i32 +; CHECK-NEXT: [[T5:%.*]] = shl nuw i32 [[T4]], 24 +; CHECK-NEXT: [[T23:%.*]] = shl nuw i32 [[T22]], 24 +; CHECK-NEXT: [[T42:%.*]] = shl nuw i32 [[T41]], 24 +; CHECK-NEXT: [[T61:%.*]] = shl nuw i32 [[T60]], 24 +; CHECK-NEXT: [[T9:%.*]] = shl nuw nsw i32 [[T8]], 16 +; CHECK-NEXT: [[T27:%.*]] = shl nuw nsw i32 [[T26]], 16 +; CHECK-NEXT: [[T46:%.*]] = shl nuw nsw i32 [[T45]], 16 +; CHECK-NEXT: [[T65:%.*]] = shl nuw nsw i32 [[T64]], 16 +; CHECK-NEXT: [[T14:%.*]] = shl nuw nsw i32 [[T13]], 8 +; CHECK-NEXT: [[T32:%.*]] = shl nuw nsw i32 [[T31]], 8 +; 
CHECK-NEXT: [[T51:%.*]] = shl nuw nsw i32 [[T50]], 8 +; CHECK-NEXT: [[T70:%.*]] = shl nuw nsw i32 [[T69]], 8 +; CHECK-NEXT: [[T10:%.*]] = or i32 [[T9]], [[T5]] +; CHECK-NEXT: [[T15:%.*]] = or i32 [[T10]], [[T14]] +; CHECK-NEXT: [[T19:%.*]] = or i32 [[T15]], [[T18]] +; CHECK-NEXT: [[T28:%.*]] = or i32 [[T27]], [[T23]] +; CHECK-NEXT: [[T33:%.*]] = or i32 [[T28]], [[T32]] +; CHECK-NEXT: [[T37:%.*]] = or i32 [[T33]], [[T36]] +; CHECK-NEXT: [[T47:%.*]] = or i32 [[T46]], [[T42]] +; CHECK-NEXT: [[T52:%.*]] = or i32 [[T47]], [[T51]] +; CHECK-NEXT: [[T56:%.*]] = or i32 [[T52]], [[T55]] +; CHECK-NEXT: [[T66:%.*]] = or i32 [[T65]], [[T61]] +; CHECK-NEXT: [[T71:%.*]] = or i32 [[T66]], [[T70]] +; CHECK-NEXT: [[T75:%.*]] = or i32 [[T71]], [[T74]] +; CHECK-NEXT: store i32 [[T19]], i32* [[T1]], align 4 +; CHECK-NEXT: store i32 [[T37]], i32* [[T38]], align 4 +; CHECK-NEXT: store i32 [[T56]], i32* [[T57]], align 4 +; CHECK-NEXT: store i32 [[T75]], i32* [[T76]], align 4 ; CHECK-NEXT: ret void ; %t6 = getelementptr inbounds i8, i8* %t0, i64 1 diff --git a/llvm/unittests/Support/Path.cpp b/llvm/unittests/Support/Path.cpp index b2eddd52e68a7e..a577f1b744bc97 100644 --- a/llvm/unittests/Support/Path.cpp +++ b/llvm/unittests/Support/Path.cpp @@ -1182,9 +1182,10 @@ TEST(Support, NormalizePath) { Tests.emplace_back("a", "a", "a"); Tests.emplace_back("a/b", "a\\b", "a/b"); Tests.emplace_back("a\\b", "a\\b", "a/b"); - Tests.emplace_back("a\\\\b", "a\\\\b", "a\\\\b"); + Tests.emplace_back("a\\\\b", "a\\\\b", "a//b"); Tests.emplace_back("\\a", "\\a", "/a"); Tests.emplace_back("a\\", "a\\", "a/"); + Tests.emplace_back("a\\t", "a\\t", "a/t"); for (auto &T : Tests) { SmallString<64> Win(std::get<0>(T)); diff --git a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h index 51228d3e8437a8..573f9b7c988f17 100644 --- a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h +++ b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.h @@ -286,6 +286,7 @@ class DmaWaitOp void 
print(OpAsmPrinter &p); LogicalResult fold(ArrayRef cstOperands, SmallVectorImpl &results); + LogicalResult verify(); }; /// Prints dimension and symbol list. diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h index 2f38a24236e3a5..7a07b6db23fce7 100644 --- a/mlir/include/mlir/Transforms/LoopUtils.h +++ b/mlir/include/mlir/Transforms/LoopUtils.h @@ -38,8 +38,9 @@ LogicalResult loopUnrollFull(AffineForOp forOp); /// Unrolls this for operation by the specified unroll factor. Returns failure /// if the loop cannot be unrolled either due to restrictions or due to invalid -/// unroll factors. +/// unroll factors. Requires positive loop bounds and step. LogicalResult loopUnrollByFactor(AffineForOp forOp, uint64_t unrollFactor); +LogicalResult loopUnrollByFactor(loop::ForOp forOp, uint64_t unrollFactor); /// Unrolls this loop by the specified unroll factor or its trip count, /// whichever is lower. @@ -68,9 +69,10 @@ LogicalResult loopUnrollJamByFactor(AffineForOp forOp, LogicalResult loopUnrollJamUpToFactor(AffineForOp forOp, uint64_t unrollJamFactor); -/// Promotes the loop body of a AffineForOp to its containing block if the -/// AffineForOp was known to have a single iteration. +/// Promotes the loop body of a AffineForOp/loop::ForOp to its containing block +/// if the loop was known to have a single iteration. LogicalResult promoteIfSingleIteration(AffineForOp forOp); +LogicalResult promoteIfSingleIteration(loop::ForOp forOp); /// Promotes all single iteration AffineForOp's in the Function, i.e., moves /// their body into the containing Block. 
diff --git a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt b/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt index 98b52eb8155274..4696dd65fa62a2 100644 --- a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt @@ -9,10 +9,12 @@ set(SOURCES if (MLIR_CUDA_CONVERSIONS_ENABLED) list(APPEND SOURCES "ConvertKernelFuncToCubin.cpp") set(NVPTX_LIBS + MC NVPTXCodeGen NVPTXDesc NVPTXInfo ) + endif() add_mlir_conversion_library(MLIRGPUtoCUDATransforms @@ -24,7 +26,6 @@ add_mlir_conversion_library(MLIRGPUtoCUDATransforms LINK_COMPONENTS Core - MC ${NVPTX_LIBS} LINK_LIBS PUBLIC diff --git a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp index 8ef24e2391524e..972a37d20f97b1 100644 --- a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp +++ b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp @@ -1444,49 +1444,82 @@ ParseResult DmaStartOp::parse(OpAsmParser &parser, OperationState &result) { parser.resolveOperands(tagIndexInfos, indexType, result.operands)) return failure(); - auto memrefType0 = types[0].dyn_cast(); - if (!memrefType0) - return parser.emitError(parser.getNameLoc(), - "expected source to be of memref type"); - - auto memrefType1 = types[1].dyn_cast(); - if (!memrefType1) - return parser.emitError(parser.getNameLoc(), - "expected destination to be of memref type"); - - auto memrefType2 = types[2].dyn_cast(); - if (!memrefType2) - return parser.emitError(parser.getNameLoc(), - "expected tag to be of memref type"); - if (isStrided) { if (parser.resolveOperands(strideInfo, indexType, result.operands)) return failure(); } - // Check that source/destination index list size matches associated rank. 
- if (static_cast(srcIndexInfos.size()) != memrefType0.getRank() || - static_cast(dstIndexInfos.size()) != memrefType1.getRank()) - return parser.emitError(parser.getNameLoc(), - "memref rank not equal to indices count"); - if (static_cast(tagIndexInfos.size()) != memrefType2.getRank()) - return parser.emitError(parser.getNameLoc(), - "tag memref rank not equal to indices count"); return success(); } LogicalResult DmaStartOp::verify() { + unsigned numOperands = getNumOperands(); + + // Mandatory non-variadic operands are: src memref, dst memref, tag memref and + // the number of elements. + if (numOperands < 4) + return emitOpError("expected at least 4 operands"); + + // Check types of operands. The order of these calls is important: the later + // calls rely on some type properties to compute the operand position. + // 1. Source memref. + if (!getSrcMemRef().getType().isa()) + return emitOpError("expected source to be of memref type"); + if (numOperands < getSrcMemRefRank() + 4) + return emitOpError() << "expected at least " << getSrcMemRefRank() + 4 + << " operands"; + if (!getSrcIndices().empty() && + !llvm::all_of(getSrcIndices().getTypes(), + [](Type t) { return t.isIndex(); })) + return emitOpError("expected source indices to be of index type"); + + // 2. Destination memref. + if (!getDstMemRef().getType().isa()) + return emitOpError("expected destination to be of memref type"); + unsigned numExpectedOperands = getSrcMemRefRank() + getDstMemRefRank() + 4; + if (numOperands < numExpectedOperands) + return emitOpError() << "expected at least " << numExpectedOperands + << " operands"; + if (!getDstIndices().empty() && + !llvm::all_of(getDstIndices().getTypes(), + [](Type t) { return t.isIndex(); })) + return emitOpError("expected destination indices to be of index type"); + + // 3. Number of elements. + if (!getNumElements().getType().isIndex()) + return emitOpError("expected num elements to be of index type"); + + // 4. Tag memref. 
+ if (!getTagMemRef().getType().isa()) + return emitOpError("expected tag to be of memref type"); + numExpectedOperands += getTagMemRefRank(); + if (numOperands < numExpectedOperands) + return emitOpError() << "expected at least " << numExpectedOperands + << " operands"; + if (!getTagIndices().empty() && + !llvm::all_of(getTagIndices().getTypes(), + [](Type t) { return t.isIndex(); })) + return emitOpError("expected tag indices to be of index type"); + // DMAs from different memory spaces supported. if (getSrcMemorySpace() == getDstMemorySpace()) return emitOpError("DMA should be between different memory spaces"); - if (getNumOperands() != getTagMemRefRank() + getSrcMemRefRank() + - getDstMemRefRank() + 3 + 1 && - getNumOperands() != getTagMemRefRank() + getSrcMemRefRank() + - getDstMemRefRank() + 3 + 1 + 2) { + // Optional stride-related operands must be either both present or both + // absent. + if (numOperands != numExpectedOperands && + numOperands != numExpectedOperands + 2) return emitOpError("incorrect number of operands"); + + // 5. Strides. 
+ if (isStrided()) { + if (!getStride().getType().isIndex() || + !getNumElementsPerStride().getType().isIndex()) + return emitOpError( + "expected stride and num elements per stride to be of type index"); } + return success(); } @@ -1536,15 +1569,6 @@ ParseResult DmaWaitOp::parse(OpAsmParser &parser, OperationState &result) { parser.resolveOperand(numElementsInfo, indexType, result.operands)) return failure(); - auto memrefType = type.dyn_cast(); - if (!memrefType) - return parser.emitError(parser.getNameLoc(), - "expected tag to be of memref type"); - - if (static_cast(tagIndexInfos.size()) != memrefType.getRank()) - return parser.emitError(parser.getNameLoc(), - "tag memref rank not equal to indices count"); - return success(); } @@ -1554,6 +1578,32 @@ LogicalResult DmaWaitOp::fold(ArrayRef cstOperands, return foldMemRefCast(*this); } +LogicalResult DmaWaitOp::verify() { + // Mandatory non-variadic operands are tag and the number of elements. + if (getNumOperands() < 2) + return emitOpError() << "expected at least 2 operands"; + + // Check types of operands. The order of these calls is important: the later + // calls rely on some type properties to compute the operand position. 
+ if (!getTagMemRef().getType().isa()) + return emitOpError() << "expected tag to be of memref type"; + + if (getNumOperands() != 2 + getTagMemRefRank()) + return emitOpError() << "expected " << 2 + getTagMemRefRank() + << " operands"; + + if (!getTagIndices().empty() && + !llvm::all_of(getTagIndices().getTypes(), + [](Type t) { return t.isIndex(); })) + return emitOpError() << "expected tag indices to be of index type"; + + if (!getNumElements().getType().isIndex()) + return emitOpError() + << "expected the number of elements to be of index type"; + + return success(); +} + //===----------------------------------------------------------------------===// // ExtractElementOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp index 4b0cd6c8eb1da1..35581eb2a39250 100644 --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -24,6 +24,7 @@ #include "mlir/IR/Function.h" #include "mlir/IR/IntegerSet.h" #include "mlir/IR/PatternMatch.h" +#include "mlir/Support/MathExtras.h" #include "mlir/Transforms/RegionUtils.h" #include "mlir/Transforms/Utils.h" #include "llvm/ADT/DenseMap.h" @@ -118,6 +119,34 @@ static void getCleanupLoopLowerBound(AffineForOp forOp, unsigned unrollFactor, lb.erase(); } +// Build the IR that performs ceil division of a positive value by a constant: +// ceildiv(a, B) = divis(a + (B-1), B) +// where divis is rounding-to-zero division. 
+static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, + int64_t divisor) { + assert(divisor > 0 && "expected positive divisor"); + assert(dividend.getType().isIndex() && "expected index-typed value"); + + Value divisorMinusOneCst = builder.create(loc, divisor - 1); + Value divisorCst = builder.create(loc, divisor); + Value sum = builder.create(loc, dividend, divisorMinusOneCst); + return builder.create(loc, sum, divisorCst); +} + +// Build the IR that performs ceil division of a positive value by another +// positive value: +// ceildiv(a, b) = divis(a + (b - 1), b) +// where divis is rounding-to-zero division. +static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, + Value divisor) { + assert(dividend.getType().isIndex() && "expected index-typed value"); + + Value cstOne = builder.create(loc, 1); + Value divisorMinusOne = builder.create(loc, divisor, cstOne); + Value sum = builder.create(loc, dividend, divisorMinusOne); + return builder.create(loc, sum, divisor); +} + /// Promotes the loop body of a forOp to its containing block if the forOp /// was known to have a single iteration. // TODO(bondhugula): extend this for arbitrary affine bounds. @@ -161,6 +190,35 @@ LogicalResult mlir::promoteIfSingleIteration(AffineForOp forOp) { return success(); } +/// Promotes the loop body of a forOp to its containing block if +/// it can be determined that the loop has a single iteration.
+LogicalResult mlir::promoteIfSingleIteration(loop::ForOp forOp) { + auto lbCstOp = + dyn_cast_or_null(forOp.lowerBound().getDefiningOp()); + auto ubCstOp = + dyn_cast_or_null(forOp.upperBound().getDefiningOp()); + auto stepCstOp = + dyn_cast_or_null(forOp.step().getDefiningOp()); + if (!lbCstOp || !ubCstOp || !stepCstOp || lbCstOp.getValue() < 0 || + ubCstOp.getValue() < 0 || stepCstOp.getValue() < 0) + return failure(); + int64_t tripCount = mlir::ceilDiv(ubCstOp.getValue() - lbCstOp.getValue(), + stepCstOp.getValue()); + if (tripCount != 1) + return failure(); + auto iv = forOp.getInductionVar(); + iv.replaceAllUsesWith(lbCstOp); + + // Move the loop body operations, except for its terminator, to the loop's + // containing block. + auto *parentBlock = forOp.getOperation()->getBlock(); + forOp.getBody()->back().erase(); + parentBlock->getOperations().splice(Block::iterator(forOp), + forOp.getBody()->getOperations()); + forOp.erase(); + return success(); +} + /// Promotes all single iteration 'for' ops in `f`, i.e., moves /// their body into the containing Block. void mlir::promoteSingleIterationLoops(FuncOp f) { @@ -416,6 +474,37 @@ LogicalResult mlir::loopUnrollUpToFactor(AffineForOp forOp, return loopUnrollByFactor(forOp, unrollFactor); } +// Generates unrolled copies of AffineForOp or loop::ForOp 'loopBodyBlock', with +// associated 'forOpIV' by 'unrollFactor', calling 'ivRemapFn' to remap +// 'forOpIV' for each unrolled body. +static void generateUnrolledLoop( + Block *loopBodyBlock, Value forOpIV, uint64_t unrollFactor, + function_ref ivRemapFn) { + // Builder to insert unrolled bodies just before the terminator of the body of + // 'forOp'. + auto builder = OpBuilder::atBlockTerminator(loopBodyBlock); + + // Keep a pointer to the last non-terminator operation in the original block + // so that we know what to clone (since we are doing this in-place). 
+ Block::iterator srcBlockEnd = std::prev(loopBodyBlock->end(), 2); + + // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies). + for (unsigned i = 1; i < unrollFactor; i++) { + BlockAndValueMapping operandMap; + + // If the induction variable is used, create a remapping to the value for + // this unrolled instance. + if (!forOpIV.use_empty()) { + Value ivUnroll = ivRemapFn(i, forOpIV, builder); + operandMap.map(forOpIV, ivUnroll); + } + + // Clone the original body of 'forOp'. + for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) + builder.clone(*it, operandMap); + } +} + /// Unrolls this loop by the specified factor. Returns success if the loop /// is successfully unrolled. LogicalResult mlir::loopUnrollByFactor(AffineForOp forOp, @@ -467,38 +556,114 @@ LogicalResult mlir::loopUnrollByFactor(AffineForOp forOp, // Scale the step of loop being unrolled by unroll factor. int64_t step = forOp.getStep(); forOp.setStep(step * unrollFactor); + generateUnrolledLoop(forOp.getBody(), forOp.getInductionVar(), unrollFactor, + [&](unsigned i, Value iv, OpBuilder b) { + // iv' = iv + i * step + auto d0 = b.getAffineDimExpr(0); + auto bumpMap = AffineMap::get(1, 0, d0 + i * step); + return b.create(forOp.getLoc(), bumpMap, + iv); + }); - // Builder to insert unrolled bodies just before the terminator of the body of - // 'forOp'. - auto builder = OpBuilder::atBlockTerminator(forOp.getBody()); + // Promote the loop body up if this has turned into a single iteration loop. + promoteIfSingleIteration(forOp); + return success(); +} - // Keep a pointer to the last non-terminator operation in the original block - // so that we know what to clone (since we are doing this in-place). - Block::iterator srcBlockEnd = std::prev(forOp.getBody()->end(), 2); +/// Unrolls 'forOp' by 'unrollFactor', returns success if the loop is unrolled. 
+LogicalResult mlir::loopUnrollByFactor(loop::ForOp forOp, + uint64_t unrollFactor) { + assert(unrollFactor > 0 && "expected positive unroll factor"); + if (unrollFactor == 1) + return promoteIfSingleIteration(forOp); - // Unroll the contents of 'forOp' (append unrollFactor - 1 additional copies). - auto forOpIV = forOp.getInductionVar(); - for (unsigned i = 1; i < unrollFactor; i++) { - BlockAndValueMapping operandMap; + // Return if the loop body is empty. + if (llvm::hasSingleElement(forOp.getBody()->getOperations())) + return success(); - // If the induction variable is used, create a remapping to the value for - // this unrolled instance. - if (!forOpIV.use_empty()) { - // iv' = iv + 1/2/3...unrollFactor-1; - auto d0 = builder.getAffineDimExpr(0); - auto bumpMap = AffineMap::get(1, 0, d0 + i * step); - auto ivUnroll = - builder.create(forOp.getLoc(), bumpMap, forOpIV); - operandMap.map(forOpIV, ivUnroll); - } + // Compute tripCount = ceilDiv((upperBound - lowerBound), step) and populate + // 'upperBoundUnrolled' and 'stepUnrolled' for static and dynamic cases. + OpBuilder boundsBuilder(forOp); + auto loc = forOp.getLoc(); + auto step = forOp.step(); + Value upperBoundUnrolled; + Value stepUnrolled; + bool generateEpilogueLoop = true; + + auto lbCstOp = + dyn_cast_or_null(forOp.lowerBound().getDefiningOp()); + auto ubCstOp = + dyn_cast_or_null(forOp.upperBound().getDefiningOp()); + auto stepCstOp = + dyn_cast_or_null(forOp.step().getDefiningOp()); + if (lbCstOp && ubCstOp && stepCstOp) { + // Constant loop bounds computation. 
+ int64_t lbCst = lbCstOp.getValue(); + int64_t ubCst = ubCstOp.getValue(); + int64_t stepCst = stepCstOp.getValue(); + assert(lbCst >= 0 && ubCst >= 0 && stepCst >= 0 && + "expected positive loop bounds and step"); + int64_t tripCount = mlir::ceilDiv(ubCst - lbCst, stepCst); + int64_t tripCountEvenMultiple = tripCount - (tripCount % unrollFactor); + int64_t upperBoundUnrolledCst = lbCst + tripCountEvenMultiple * stepCst; + assert(upperBoundUnrolledCst <= ubCst); + int64_t stepUnrolledCst = stepCst * unrollFactor; + + // Create constant for 'upperBoundUnrolled' and set epilogue loop flag. + generateEpilogueLoop = upperBoundUnrolledCst < ubCst; + if (generateEpilogueLoop) + upperBoundUnrolled = + boundsBuilder.create(loc, upperBoundUnrolledCst); + else + upperBoundUnrolled = ubCstOp; + + // Create constant for 'stepUnrolled'. + stepUnrolled = + stepCst == stepUnrolledCst + ? step + : boundsBuilder.create(loc, stepUnrolledCst); + } else { + // Dynamic loop bounds computation. + // TODO(andydavis) Add dynamic asserts for negative lb/ub/step, or + // consider using ceilDiv from AffineApplyExpander. + auto lowerBound = forOp.lowerBound(); + auto upperBound = forOp.upperBound(); + Value diff = boundsBuilder.create(loc, upperBound, lowerBound); + Value tripCount = ceilDivPositive(boundsBuilder, loc, diff, step); + Value unrollFactorCst = + boundsBuilder.create(loc, unrollFactor); + Value tripCountRem = + boundsBuilder.create(loc, tripCount, unrollFactorCst); + // Compute tripCountEvenMultiple = tripCount - (tripCount % unrollFactor) + Value tripCountEvenMultiple = + boundsBuilder.create(loc, tripCount, tripCountRem); + // Compute upperBoundUnrolled = lowerBound + tripCountEvenMultiple * step + upperBoundUnrolled = boundsBuilder.create( + loc, lowerBound, + boundsBuilder.create(loc, tripCountEvenMultiple, step)); + // Scale 'step' by 'unrollFactor'. + stepUnrolled = boundsBuilder.create(loc, step, unrollFactorCst); + } - // Clone the original body of 'forOp'. 
- for (auto it = forOp.getBody()->begin(); it != std::next(srcBlockEnd); - it++) { - builder.clone(*it, operandMap); - } + // Create epilogue clean up loop starting at 'upperBoundUnrolled'. + if (generateEpilogueLoop) { + OpBuilder epilogueBuilder(forOp.getOperation()->getBlock(), + std::next(Block::iterator(forOp))); + auto epilogueForOp = cast(epilogueBuilder.clone(*forOp)); + epilogueForOp.setLowerBound(upperBoundUnrolled); + promoteIfSingleIteration(epilogueForOp); } + // Create unrolled loop. + forOp.setUpperBound(upperBoundUnrolled); + forOp.setStep(stepUnrolled); + generateUnrolledLoop(forOp.getBody(), forOp.getInductionVar(), unrollFactor, + [&](unsigned i, Value iv, OpBuilder b) { + // iv' = iv + step * i; + auto stride = b.create( + loc, step, b.create(loc, i)); + return b.create(loc, iv, stride); + }); // Promote the loop body up if this has turned into a single iteration loop. promoteIfSingleIteration(forOp); return success(); @@ -1032,34 +1197,6 @@ Loops mlir::tilePerfectlyNested(loop::ForOp rootForOp, ArrayRef sizes) { return ::tile(forOps, sizes, forOps.back()); } -// Build the IR that performs ceil division of a positive value by a constant: -// ceildiv(a, B) = divis(a + (B-1), B) -// where divis is rounding-to-zero division. -static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, - int64_t divisor) { - assert(divisor > 0 && "expected positive divisor"); - assert(dividend.getType().isIndex() && "expected index-typed value"); - - Value divisorMinusOneCst = builder.create(loc, divisor - 1); - Value divisorCst = builder.create(loc, divisor); - Value sum = builder.create(loc, dividend, divisorMinusOneCst); - return builder.create(loc, sum, divisorCst); -} - -// Build the IR that performs ceil division of a positive value by another -// positive value: -// ceildiv(a, b) = divis(a + (b - 1), b) -// where divis is rounding-to-zero division. 
-static Value ceilDivPositive(OpBuilder &builder, Location loc, Value dividend, - Value divisor) { - assert(dividend.getType().isIndex() && "expected index-typed value"); - - Value cstOne = builder.create(loc, 1); - Value divisorMinusOne = builder.create(loc, divisor, cstOne); - Value sum = builder.create(loc, dividend, divisorMinusOne); - return builder.create(loc, sum, divisor); -} - // Hoist the ops within `outer` that appear before `inner`. // Such ops include the ops that have been introduced by parametric tiling. // Ops that come from triangular loops (i.e. that belong to the program slice diff --git a/mlir/test/Dialect/Loops/loop-unroll.mlir b/mlir/test/Dialect/Loops/loop-unroll.mlir new file mode 100644 index 00000000000000..fa3ebc173e510c --- /dev/null +++ b/mlir/test/Dialect/Loops/loop-unroll.mlir @@ -0,0 +1,250 @@ +// RUN: mlir-opt %s -test-loop-unrolling='unroll-factor=2' | FileCheck %s --check-prefix UNROLL-BY-2 +// RUN: mlir-opt %s -test-loop-unrolling='unroll-factor=3' | FileCheck %s --check-prefix UNROLL-BY-3 +// RUN: mlir-opt %s -test-loop-unrolling='unroll-factor=2 loop-depth=0' | FileCheck %s --check-prefix UNROLL-OUTER-BY-2 +// RUN: mlir-opt %s -test-loop-unrolling='unroll-factor=2 loop-depth=1' | FileCheck %s --check-prefix UNROLL-INNER-BY-2 + +func @dynamic_loop_unroll(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3: memref) { + %0 = constant 7.0 : f32 + loop.for %i0 = %arg0 to %arg1 step %arg2 { + store %0, %arg3[%i0] : memref + } + return +} +// UNROLL-BY-2-LABEL: func @dynamic_loop_unroll +// UNROLL-BY-2-SAME: %[[LB:.*0]]: index, +// UNROLL-BY-2-SAME: %[[UB:.*1]]: index, +// UNROLL-BY-2-SAME: %[[STEP:.*2]]: index, +// UNROLL-BY-2-SAME: %[[MEM:.*3]]: memref +// +// UNROLL-BY-2-DAG: %[[V0:.*]] = subi %[[UB]], %[[LB]] : index +// UNROLL-BY-2-DAG: %[[C1:.*]] = constant 1 : index +// UNROLL-BY-2-DAG: %[[V1:.*]] = subi %[[STEP]], %[[C1]] : index +// UNROLL-BY-2-DAG: %[[V2:.*]] = addi %[[V0]], %[[V1]] : index +// Compute trip count in V3. 
+// UNROLL-BY-2-DAG: %[[V3:.*]] = divi_signed %[[V2]], %[[STEP]] : index +// Store unroll factor in C2. +// UNROLL-BY-2-DAG: %[[C2:.*]] = constant 2 : index +// UNROLL-BY-2-DAG: %[[V4:.*]] = remi_signed %[[V3]], %[[C2]] : index +// UNROLL-BY-2-DAG: %[[V5:.*]] = subi %[[V3]], %[[V4]] : index +// UNROLL-BY-2-DAG: %[[V6:.*]] = muli %[[V5]], %[[STEP]] : index +// Compute upper bound of unrolled loop in V7. +// UNROLL-BY-2-DAG: %[[V7:.*]] = addi %[[LB]], %[[V6]] : index +// Compute step of unrolled loop in V8. +// UNROLL-BY-2-DAG: %[[V8:.*]] = muli %[[STEP]], %[[C2]] : index +// UNROLL-BY-2: loop.for %[[IV:.*]] = %[[LB]] to %[[V7]] step %[[V8]] { +// UNROLL-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[IV]]] : memref +// UNROLL-BY-2-NEXT: %[[C1_IV:.*]] = constant 1 : index +// UNROLL-BY-2-NEXT: %[[V9:.*]] = muli %[[STEP]], %[[C1_IV]] : index +// UNROLL-BY-2-NEXT: %[[V10:.*]] = addi %[[IV]], %[[V9]] : index +// UNROLL-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[V10]]] : memref +// UNROLL-BY-2-NEXT: } +// UNROLL-BY-2-NEXT: loop.for %[[IV:.*]] = %[[V7]] to %[[UB]] step %[[STEP]] { +// UNROLL-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[IV]]] : memref +// UNROLL-BY-2-NEXT: } +// UNROLL-BY-2-NEXT: return + +// UNROLL-BY-3-LABEL: func @dynamic_loop_unroll +// UNROLL-BY-3-SAME: %[[LB:.*0]]: index, +// UNROLL-BY-3-SAME: %[[UB:.*1]]: index, +// UNROLL-BY-3-SAME: %[[STEP:.*2]]: index, +// UNROLL-BY-3-SAME: %[[MEM:.*3]]: memref +// +// UNROLL-BY-3-DAG: %[[V0:.*]] = subi %[[UB]], %[[LB]] : index +// UNROLL-BY-3-DAG: %[[C1:.*]] = constant 1 : index +// UNROLL-BY-3-DAG: %[[V1:.*]] = subi %[[STEP]], %[[C1]] : index +// UNROLL-BY-3-DAG: %[[V2:.*]] = addi %[[V0]], %[[V1]] : index +// Compute trip count in V3. +// UNROLL-BY-3-DAG: %[[V3:.*]] = divi_signed %[[V2]], %[[STEP]] : index +// Store unroll factor in C3. 
+// UNROLL-BY-3-DAG: %[[C3:.*]] = constant 3 : index +// UNROLL-BY-3-DAG: %[[V4:.*]] = remi_signed %[[V3]], %[[C3]] : index +// UNROLL-BY-3-DAG: %[[V5:.*]] = subi %[[V3]], %[[V4]] : index +// UNROLL-BY-3-DAG: %[[V6:.*]] = muli %[[V5]], %[[STEP]] : index +// Compute upper bound of unrolled loop in V7. +// UNROLL-BY-3-DAG: %[[V7:.*]] = addi %[[LB]], %[[V6]] : index +// Compute step of unrolled loop in V8. +// UNROLL-BY-3-DAG: %[[V8:.*]] = muli %[[STEP]], %[[C3]] : index +// UNROLL-BY-3: loop.for %[[IV:.*]] = %[[LB]] to %[[V7]] step %[[V8]] { +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[IV]]] : memref +// UNROLL-BY-3-NEXT: %[[C1_IV:.*]] = constant 1 : index +// UNROLL-BY-3-NEXT: %[[V9:.*]] = muli %[[STEP]], %[[C1_IV]] : index +// UNROLL-BY-3-NEXT: %[[V10:.*]] = addi %[[IV]], %[[V9]] : index +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[V10]]] : memref +// UNROLL-BY-3-NEXT: %[[C2_IV:.*]] = constant 2 : index +// UNROLL-BY-3-NEXT: %[[V11:.*]] = muli %[[STEP]], %[[C2_IV]] : index +// UNROLL-BY-3-NEXT: %[[V12:.*]] = addi %[[IV]], %[[V11]] : index +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[V12]]] : memref +// UNROLL-BY-3-NEXT: } +// UNROLL-BY-3-NEXT: loop.for %[[IV:.*]] = %[[V7]] to %[[UB]] step %[[STEP]] { +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[IV]]] : memref +// UNROLL-BY-3-NEXT: } +// UNROLL-BY-3-NEXT: return + +func @dynamic_loop_unroll_outer_by_2( + %arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index, + %arg5 : index, %arg6: memref) { + %0 = constant 7.0 : f32 + loop.for %i0 = %arg0 to %arg1 step %arg2 { + loop.for %i1 = %arg3 to %arg4 step %arg5 { + store %0, %arg6[%i1] : memref + } + } + return +} +// UNROLL-OUTER-BY-2-LABEL: func @dynamic_loop_unroll_outer_by_2 +// UNROLL-OUTER-BY-2-SAME: %[[LB0:.*0]]: index, +// UNROLL-OUTER-BY-2-SAME: %[[UB0:.*1]]: index, +// UNROLL-OUTER-BY-2-SAME: %[[STEP0:.*2]]: index, +// UNROLL-OUTER-BY-2-SAME: %[[LB1:.*3]]: index, +// UNROLL-OUTER-BY-2-SAME: %[[UB1:.*4]]: index, +// 
UNROLL-OUTER-BY-2-SAME: %[[STEP1:.*5]]: index, +// UNROLL-OUTER-BY-2-SAME: %[[MEM:.*6]]: memref +// +// UNROLL-OUTER-BY-2: loop.for %[[IV0:.*]] = %[[LB0]] to %{{.*}} step %{{.*}} { +// UNROLL-OUTER-BY-2-NEXT: loop.for %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] { +// UNROLL-OUTER-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[IV1]]] : memref +// UNROLL-OUTER-BY-2-NEXT: } +// UNROLL-OUTER-BY-2-NEXT: loop.for %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] { +// UNROLL-OUTER-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[IV1]]] : memref +// UNROLL-OUTER-BY-2-NEXT: } +// UNROLL-OUTER-BY-2-NEXT: } +// UNROLL-OUTER-BY-2-NEXT: loop.for %[[IV0:.*]] = %{{.*}} to %[[UB0]] step %[[STEP0]] { +// UNROLL-OUTER-BY-2-NEXT: loop.for %[[IV1:.*]] = %[[LB1]] to %[[UB1]] step %[[STEP1]] { +// UNROLL-OUTER-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[IV1]]] : memref +// UNROLL-OUTER-BY-2-NEXT: } +// UNROLL-OUTER-BY-2-NEXT: } +// UNROLL-OUTER-BY-2-NEXT: return + +func @dynamic_loop_unroll_inner_by_2( + %arg0 : index, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index, + %arg5 : index, %arg6: memref) { + %0 = constant 7.0 : f32 + loop.for %i0 = %arg0 to %arg1 step %arg2 { + loop.for %i1 = %arg3 to %arg4 step %arg5 { + store %0, %arg6[%i1] : memref + } + } + return +} +// UNROLL-INNER-BY-2-LABEL: func @dynamic_loop_unroll_inner_by_2 +// UNROLL-INNER-BY-2-SAME: %[[LB0:.*0]]: index, +// UNROLL-INNER-BY-2-SAME: %[[UB0:.*1]]: index, +// UNROLL-INNER-BY-2-SAME: %[[STEP0:.*2]]: index, +// UNROLL-INNER-BY-2-SAME: %[[LB1:.*3]]: index, +// UNROLL-INNER-BY-2-SAME: %[[UB1:.*4]]: index, +// UNROLL-INNER-BY-2-SAME: %[[STEP1:.*5]]: index, +// UNROLL-INNER-BY-2-SAME: %[[MEM:.*6]]: memref +// +// UNROLL-INNER-BY-2: loop.for %[[IV0:.*]] = %[[LB0]] to %[[UB0]] step %[[STEP0]] { +// UNROLL-INNER-BY-2: loop.for %[[IV1:.*]] = %[[LB1]] to %{{.*}} step %{{.*}} { +// UNROLL-INNER-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[IV1]]] : memref +// UNROLL-INNER-BY-2-NEXT: %[[C1_IV:.*]] = constant 1 : index +// 
UNROLL-INNER-BY-2-NEXT: %[[V0:.*]] = muli %[[STEP1]], %[[C1_IV]] : index +// UNROLL-INNER-BY-2-NEXT: %[[V1:.*]] = addi %[[IV1]], %[[V0]] : index +// UNROLL-INNER-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[V1]]] : memref +// UNROLL-INNER-BY-2-NEXT: } +// UNROLL-INNER-BY-2-NEXT: loop.for %[[IV1:.*]] = %{{.*}} to %[[UB1]] step %[[STEP1]] { +// UNROLL-INNER-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[IV1]]] : memref +// UNROLL-INNER-BY-2-NEXT: } +// UNROLL-INNER-BY-2-NEXT: } +// UNROLL-INNER-BY-2-NEXT: return + +// Test that no epilogue clean-up loop is generated because the trip count is +// a multiple of the unroll factor. +func @static_loop_unroll_by_2(%arg0 : memref) { + %0 = constant 7.0 : f32 + %lb = constant 0 : index + %ub = constant 20 : index + %step = constant 1 : index + loop.for %i0 = %lb to %ub step %step { + store %0, %arg0[%i0] : memref + } + return +} +// UNROLL-BY-2-LABEL: func @static_loop_unroll_by_2 +// UNROLL-BY-2-SAME: %[[MEM:.*0]]: memref +// +// UNROLL-BY-2-DAG: %[[C0:.*]] = constant 0 : index +// UNROLL-BY-2-DAG: %[[C1:.*]] = constant 1 : index +// UNROLL-BY-2-DAG: %[[C20:.*]] = constant 20 : index +// UNROLL-BY-2-DAG: %[[C2:.*]] = constant 2 : index +// UNROLL-BY-2: loop.for %[[IV:.*]] = %[[C0]] to %[[C20]] step %[[C2]] { +// UNROLL-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[IV]]] : memref +// UNROLL-BY-2-NEXT: %[[C1_IV:.*]] = constant 1 : index +// UNROLL-BY-2-NEXT: %[[V0:.*]] = muli %[[C1]], %[[C1_IV]] : index +// UNROLL-BY-2-NEXT: %[[V1:.*]] = addi %[[IV]], %[[V0]] : index +// UNROLL-BY-2-NEXT: store %{{.*}}, %[[MEM]][%[[V1]]] : memref +// UNROLL-BY-2-NEXT: } +// UNROLL-BY-2-NEXT: return + +// Test that epilogue clean up loop is generated (trip count is not +// a multiple of unroll factor). 
+func @static_loop_unroll_by_3(%arg0 : memref) { + %0 = constant 7.0 : f32 + %lb = constant 0 : index + %ub = constant 20 : index + %step = constant 1 : index + loop.for %i0 = %lb to %ub step %step { + store %0, %arg0[%i0] : memref + } + return +} + +// UNROLL-BY-3-LABEL: func @static_loop_unroll_by_3 +// UNROLL-BY-3-SAME: %[[MEM:.*0]]: memref +// +// UNROLL-BY-3-DAG: %[[C0:.*]] = constant 0 : index +// UNROLL-BY-3-DAG: %[[C1:.*]] = constant 1 : index +// UNROLL-BY-3-DAG: %[[C20:.*]] = constant 20 : index +// UNROLL-BY-3-DAG: %[[C18:.*]] = constant 18 : index +// UNROLL-BY-3-DAG: %[[C3:.*]] = constant 3 : index +// UNROLL-BY-3: loop.for %[[IV:.*]] = %[[C0]] to %[[C18]] step %[[C3]] { +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[IV]]] : memref +// UNROLL-BY-3-NEXT: %[[C1_IV:.*]] = constant 1 : index +// UNROLL-BY-3-NEXT: %[[V0:.*]] = muli %[[C1]], %[[C1_IV]] : index +// UNROLL-BY-3-NEXT: %[[V1:.*]] = addi %[[IV]], %[[V0]] : index +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[V1]]] : memref +// UNROLL-BY-3-NEXT: %[[C2_IV:.*]] = constant 2 : index +// UNROLL-BY-3-NEXT: %[[V2:.*]] = muli %[[C1]], %[[C2_IV]] : index +// UNROLL-BY-3-NEXT: %[[V3:.*]] = addi %[[IV]], %[[V2]] : index +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[V3]]] : memref +// UNROLL-BY-3-NEXT: } +// UNROLL-BY-3-NEXT: loop.for %[[IV:.*]] = %[[C18]] to %[[C20]] step %[[C1]] { +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[IV]]] : memref +// UNROLL-BY-3-NEXT: } +// UNROLL-BY-3-NEXT: return + +// Test that the single iteration epilogue loop body is promoted to the loop's +// containing block.
+func @static_loop_unroll_by_3_promote_epilogue(%arg0 : memref) { + %0 = constant 7.0 : f32 + %lb = constant 0 : index + %ub = constant 10 : index + %step = constant 1 : index + loop.for %i0 = %lb to %ub step %step { + store %0, %arg0[%i0] : memref + } + return +} +// UNROLL-BY-3-LABEL: func @static_loop_unroll_by_3_promote_epilogue +// UNROLL-BY-3-SAME: %[[MEM:.*0]]: memref +// +// UNROLL-BY-3-DAG: %[[C0:.*]] = constant 0 : index +// UNROLL-BY-3-DAG: %[[C1:.*]] = constant 1 : index +// UNROLL-BY-3-DAG: %[[C10:.*]] = constant 10 : index +// UNROLL-BY-3-DAG: %[[C9:.*]] = constant 9 : index +// UNROLL-BY-3-DAG: %[[C3:.*]] = constant 3 : index +// UNROLL-BY-3: loop.for %[[IV:.*]] = %[[C0]] to %[[C9]] step %[[C3]] { +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[IV]]] : memref +// UNROLL-BY-3-NEXT: %[[C1_IV:.*]] = constant 1 : index +// UNROLL-BY-3-NEXT: %[[V0:.*]] = muli %[[C1]], %[[C1_IV]] : index +// UNROLL-BY-3-NEXT: %[[V1:.*]] = addi %[[IV]], %[[V0]] : index +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[V1]]] : memref +// UNROLL-BY-3-NEXT: %[[C2_IV:.*]] = constant 2 : index +// UNROLL-BY-3-NEXT: %[[V2:.*]] = muli %[[C1]], %[[C2_IV]] : index +// UNROLL-BY-3-NEXT: %[[V3:.*]] = addi %[[IV]], %[[V2]] : index +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[V3]]] : memref +// UNROLL-BY-3-NEXT: } +// UNROLL-BY-3-NEXT: store %{{.*}}, %[[MEM]][%[[C9]]] : memref +// UNROLL-BY-3-NEXT: return diff --git a/mlir/test/Examples/standalone/lit.local.cfg b/mlir/test/Examples/standalone/lit.local.cfg new file mode 100644 index 00000000000000..481b809a0e486a --- /dev/null +++ b/mlir/test/Examples/standalone/lit.local.cfg @@ -0,0 +1,3 @@ +config.substitutions.append(("%cmake", config.host_cmake)) +config.substitutions.append(("%host_cxx", config.host_cxx)) +config.substitutions.append(("%host_cc", config.host_cc)) diff --git a/mlir/test/Examples/standalone/test.toy b/mlir/test/Examples/standalone/test.toy new file mode 100644 index 00000000000000..4f9ba5cc78e114 --- /dev/null 
+++ b/mlir/test/Examples/standalone/test.toy @@ -0,0 +1,4 @@ +# RUN: %cmake %mlir_src_root/examples/standalone -DCMAKE_CXX_COMPILER=%host_cxx -DCMAKE_C_COMPILER=%host_cc -DMLIR_DIR=%llvm_lib_dir/cmake/mlir ; %cmake --build . --target check-standalone-opt | tee %t | FileCheck %s + +# CHECK: Expected Passes: 1 +# UNSUPPORTED: windows, android diff --git a/mlir/test/IR/invalid-ops.mlir b/mlir/test/IR/invalid-ops.mlir index 2145c1bbc17220..2a14c3ae6c419c 100644 --- a/mlir/test/IR/invalid-ops.mlir +++ b/mlir/test/IR/invalid-ops.mlir @@ -303,6 +303,13 @@ func @invalid_cmp_shape(%idx : () -> ()) { // ----- +func @dma_start_not_enough_operands() { + // expected-error@+1 {{expected at least 4 operands}} + "std.dma_start"() : () -> () +} + +// ----- + func @dma_no_src_memref(%m : f32, %tag : f32, %c0 : index) { // expected-error@+1 {{expected source to be of memref type}} dma_start %m[%c0], %m[%c0], %c0, %tag[%c0] : f32, f32, f32 @@ -310,6 +317,24 @@ func @dma_no_src_memref(%m : f32, %tag : f32, %c0 : index) { // ----- +func @dma_start_not_enough_operands_for_src( + %src: memref<2x2x2xf32>, %idx: index) { + // expected-error@+1 {{expected at least 7 operands}} + "std.dma_start"(%src, %idx, %idx, %idx) : (memref<2x2x2xf32>, index, index, index) -> () +} + +// ----- + +func @dma_start_src_index_wrong_type( + %src: memref<2x2xf32>, %idx: index, %dst: memref<2xf32,1>, + %tag: memref<i32,2>, %flt: f32) { + // expected-error@+1 {{expected source indices to be of index type}} + "std.dma_start"(%src, %idx, %flt, %dst, %idx, %tag, %idx) + : (memref<2x2xf32>, index, f32, memref<2xf32,1>, index, memref<i32,2>, index) -> () +} + +// ----- + func @dma_no_dst_memref(%m : f32, %tag : f32, %c0 : index) { %mref = alloc() : memref<8 x f32> // expected-error@+1 {{expected destination to be of memref type}} @@ -318,6 +343,36 @@ func @dma_no_dst_memref(%m : f32, %tag : f32, %c0 : index) { // ----- +func @dma_start_not_enough_operands_for_dst( + %src: memref<2x2xf32>, %idx: index, %dst: memref<2xf32,1>, + 
%tag: memref<i32,2>) { + // expected-error@+1 {{expected at least 7 operands}} + "std.dma_start"(%src, %idx, %idx, %dst, %idx, %idx) + : (memref<2x2xf32>, index, index, memref<2xf32,1>, index, index) -> () +} + +// ----- + +func @dma_start_dst_index_wrong_type( + %src: memref<2x2xf32>, %idx: index, %dst: memref<2xf32,1>, + %tag: memref<i32,2>, %flt: f32) { + // expected-error@+1 {{expected destination indices to be of index type}} + "std.dma_start"(%src, %idx, %idx, %dst, %flt, %tag, %idx) + : (memref<2x2xf32>, index, index, memref<2xf32,1>, f32, memref<i32,2>, index) -> () +} + +// ----- + +func @dma_start_dst_index_wrong_type( + %src: memref<2x2xf32>, %idx: index, %dst: memref<2xf32,1>, + %tag: memref<i32,2>, %flt: f32) { + // expected-error@+1 {{expected num elements to be of index type}} + "std.dma_start"(%src, %idx, %idx, %dst, %idx, %flt, %tag) + : (memref<2x2xf32>, index, index, memref<2xf32,1>, index, f32, memref<i32,2>) -> () +} + +// ----- + func @dma_no_tag_memref(%tag : f32, %c0 : index) { %mref = alloc() : memref<8 x f32> // expected-error@+1 {{expected tag to be of memref type}} @@ -326,9 +381,80 @@ func @dma_no_tag_memref(%tag : f32, %c0 : index) { // ----- +func @dma_start_not_enough_operands_for_tag( + %src: memref<2x2xf32>, %idx: index, %dst: memref<2xf32,1>, + %tag: memref<2xi32,2>) { + // expected-error@+1 {{expected at least 8 operands}} + "std.dma_start"(%src, %idx, %idx, %dst, %idx, %idx, %tag) + : (memref<2x2xf32>, index, index, memref<2xf32,1>, index, index, memref<2xi32,2>) -> () +} + +// ----- + +func @dma_start_dst_index_wrong_type( + %src: memref<2x2xf32>, %idx: index, %dst: memref<2xf32,1>, + %tag: memref<2xi32,2>, %flt: f32) { + // expected-error@+1 {{expected tag indices to be of index type}} + "std.dma_start"(%src, %idx, %idx, %dst, %idx, %idx, %tag, %flt) + : (memref<2x2xf32>, index, index, memref<2xf32,1>, index, index, memref<2xi32,2>, f32) -> () +} + +// ----- + +func @dma_start_same_space( + %src: memref<2x2xf32>, %idx: index, %dst: memref<2xf32>, + %tag: 
memref<i32,2>) { + // expected-error@+1 {{DMA should be between different memory spaces}} + dma_start %src[%idx, %idx], %dst[%idx], %idx, %tag[] : memref<2x2xf32>, memref<2xf32>, memref<i32,2> +} + +// ----- + +func @dma_start_too_many_operands( + %src: memref<2x2xf32>, %idx: index, %dst: memref<2xf32,1>, + %tag: memref<i32,2>) { + // expected-error@+1 {{incorrect number of operands}} + "std.dma_start"(%src, %idx, %idx, %dst, %idx, %idx, %tag, %idx, %idx, %idx) + : (memref<2x2xf32>, index, index, memref<2xf32,1>, index, index, memref<i32,2>, index, index, index) -> () +} + + +// ----- + +func @dma_start_wrong_stride_type( + %src: memref<2x2xf32>, %idx: index, %dst: memref<2xf32,1>, + %tag: memref<i32,2>, %flt: f32) { + // expected-error@+1 {{expected stride and num elements per stride to be of type index}} + "std.dma_start"(%src, %idx, %idx, %dst, %idx, %idx, %tag, %idx, %flt) + : (memref<2x2xf32>, index, index, memref<2xf32,1>, index, index, memref<i32,2>, index, f32) -> () +} + +// ----- + +func @dma_wait_not_enough_operands() { + // expected-error@+1 {{expected at least 2 operands}} + "std.dma_wait"() : () -> () +} + +// ----- + func @dma_wait_no_tag_memref(%tag : f32, %c0 : index) { // expected-error@+1 {{expected tag to be of memref type}} - dma_wait %tag[%c0], %arg0 : f32 + "std.dma_wait"(%tag, %c0, %c0) : (f32, index, index) -> () +} + +// ----- + +func @dma_wait_wrong_index_type(%tag : memref<2xi32>, %idx: index, %flt: f32) { + // expected-error@+1 {{expected tag indices to be of index type}} + "std.dma_wait"(%tag, %flt, %idx) : (memref<2xi32>, f32, index) -> () +} + +// ----- + +func @dma_wait_wrong_num_elements_type(%tag : memref<2xi32>, %idx: index, %flt: f32) { + // expected-error@+1 {{expected the number of elements to be of index type}} + "std.dma_wait"(%tag, %idx, %flt) : (memref<2xi32>, index, f32) -> () } // ----- diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt index 248da51bcec722..e7b31b3d0bcfec 100644 --- 
a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -14,6 +14,7 @@ add_mlir_library(MLIRTestTransforms TestLiveness.cpp TestLoopMapping.cpp TestLoopParametricTiling.cpp + TestLoopUnrolling.cpp TestOpaqueLoc.cpp TestMemRefBoundCheck.cpp TestMemRefDependenceCheck.cpp diff --git a/mlir/test/lib/Transforms/TestLoopUnrolling.cpp b/mlir/test/lib/Transforms/TestLoopUnrolling.cpp new file mode 100644 index 00000000000000..7cd221f37f8c00 --- /dev/null +++ b/mlir/test/lib/Transforms/TestLoopUnrolling.cpp @@ -0,0 +1,68 @@ +//===-------- TestLoopUnrolling.cpp --- loop unrolling test pass ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass to unroll loops by a specified unroll factor. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/LoopOps/LoopOps.h" +#include "mlir/IR/Builders.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/LoopUtils.h" +#include "mlir/Transforms/Passes.h" + +using namespace mlir; + +namespace { + +static unsigned getNestingDepth(Operation *op) { + Operation *currOp = op; + unsigned depth = 0; + while ((currOp = currOp->getParentOp())) { + if (isa<loop::ForOp>(currOp)) + depth++; + } + return depth; +} + +class TestLoopUnrollingPass + : public PassWrapper<TestLoopUnrollingPass, FunctionPass> { +public: + TestLoopUnrollingPass() = default; + TestLoopUnrollingPass(const TestLoopUnrollingPass &) {} + explicit TestLoopUnrollingPass(uint64_t unrollFactorParam, + unsigned loopDepthParam) { + unrollFactor = unrollFactorParam; + loopDepth = loopDepthParam; + } + + void runOnFunction() override { + FuncOp func = getFunction(); + SmallVector<loop::ForOp, 4> loops; + func.walk([&](loop::ForOp forOp) { + if (getNestingDepth(forOp) == loopDepth) + loops.push_back(forOp); + }); + for (auto loop : loops) { + loopUnrollByFactor(loop, unrollFactor); + } + } + Option<uint64_t> unrollFactor{*this, "unroll-factor", + llvm::cl::desc("Loop unroll factor."), + llvm::cl::init(1)}; + Option<unsigned> loopDepth{*this, "loop-depth", llvm::cl::desc("Loop depth."), + llvm::cl::init(0)}; +}; +} // end namespace + +namespace mlir { +void registerTestLoopUnrollingPass() { + PassRegistration<TestLoopUnrollingPass>( + "test-loop-unrolling", "Tests loop unrolling transformation"); +} +} // namespace mlir diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index 65f80315d57aa4..e78c82815b15a6 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -31,6 +31,7 @@ config.substitutions.append(('%PATH%', config.environment['PATH'])) config.substitutions.append(('%shlibext', config.llvm_shlib_ext)) +config.substitutions.append(("%mlir_src_root", config.mlir_src_root)) llvm_config.with_system_environment( ['HOME', 'INCLUDE', 'LIB', 'TMP', 'TEMP']) diff --git a/mlir/test/lit.site.cfg.py.in 
b/mlir/test/lit.site.cfg.py.in index dafb1c9a3eb861..dc6286a827bb73 100644 --- a/mlir/test/lit.site.cfg.py.in +++ b/mlir/test/lit.site.cfg.py.in @@ -23,6 +23,7 @@ config.llvm_bindings = "@LLVM_BINDINGS@".split(' ') config.host_os = "@HOST_OS@" config.host_cc = "@HOST_CC@" config.host_cxx = "@HOST_CXX@" +config.host_cmake = "@CMAKE_COMMAND@" # Note: ldflags can contain double-quoted paths, so must use single quotes here. config.host_ldflags = '@HOST_LDFLAGS@' config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" diff --git a/mlir/tools/mlir-opt/CMakeLists.txt b/mlir/tools/mlir-opt/CMakeLists.txt index 2504b04420b79a..ca39f37a8d8d03 100644 --- a/mlir/tools/mlir-opt/CMakeLists.txt +++ b/mlir/tools/mlir-opt/CMakeLists.txt @@ -30,7 +30,6 @@ set(LIBS MLIRTestTransforms MLIRSupport MLIRIR - MLIROptLib ) # Exclude from libMLIR.so because this has static options intended for @@ -42,6 +41,10 @@ add_mlir_library(MLIRMlirOptMain LINK_LIBS ${LIBS} + + DEPENDS + intrinsics_gen + mlir-headers ) add_llvm_tool(mlir-opt diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index c5cc533ab1199a..9d583dc2a3198a 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -53,6 +53,7 @@ void registerTestLinalgTransforms(); void registerTestLivenessPass(); void registerTestLoopFusion(); void registerTestLoopMappingPass(); +void registerTestLoopUnrollingPass(); void registerTestMatchers(); void registerTestMemRefDependenceCheck(); void registerTestMemRefStrideCalculation(); @@ -119,6 +120,7 @@ void registerTestPasses() { registerTestLivenessPass(); registerTestLoopFusion(); registerTestLoopMappingPass(); + registerTestLoopUnrollingPass(); registerTestMatchers(); registerTestMemRefDependenceCheck(); registerTestMemRefStrideCalculation();