diff --git a/compiler-rt/lib/dfsan/dfsan.cpp b/compiler-rt/lib/dfsan/dfsan.cpp index 2d30c2d49419b1..0e2fb9f5f3347d 100644 --- a/compiler-rt/lib/dfsan/dfsan.cpp +++ b/compiler-rt/lib/dfsan/dfsan.cpp @@ -158,12 +158,6 @@ static void dfsan_check_label(dfsan_label label) { } } -static void ReportUnsupportedFast16(const char *func) { - Report("FATAL: DataFlowSanitizer: %s is unsupported in fast16labels mode\n", - func); - Die(); -} - // Resolves the union of two unequal labels. Nonequality is a precondition for // this function (the instrumentation pass inlines the equality test). extern "C" SANITIZER_INTERFACE_ATTRIBUTE @@ -259,10 +253,8 @@ dfsan_union(dfsan_label l1, dfsan_label l2) { extern "C" SANITIZER_INTERFACE_ATTRIBUTE dfsan_label dfsan_create_label(const char *desc, void *userdata) { - if (flags().fast16labels) - ReportUnsupportedFast16("dfsan_create_label"); dfsan_label label = - atomic_fetch_add(&__dfsan_last_label, 1, memory_order_relaxed) + 1; + atomic_fetch_add(&__dfsan_last_label, 1, memory_order_relaxed) + 1; dfsan_check_label(label); __dfsan_label_info[label].l1 = __dfsan_label_info[label].l2 = 0; __dfsan_label_info[label].desc = desc; @@ -319,15 +311,11 @@ dfsan_read_label(const void *addr, uptr size) { extern "C" SANITIZER_INTERFACE_ATTRIBUTE const struct dfsan_label_info *dfsan_get_label_info(dfsan_label label) { - if (flags().fast16labels) - ReportUnsupportedFast16("dfsan_get_label_info"); return &__dfsan_label_info[label]; } extern "C" SANITIZER_INTERFACE_ATTRIBUTE int dfsan_has_label(dfsan_label label, dfsan_label elem) { - if (flags().fast16labels) - return label & elem; if (label == elem) return true; const dfsan_label_info *info = dfsan_get_label_info(label); @@ -340,8 +328,6 @@ dfsan_has_label(dfsan_label label, dfsan_label elem) { extern "C" SANITIZER_INTERFACE_ATTRIBUTE dfsan_label dfsan_has_label_with_desc(dfsan_label label, const char *desc) { - if (flags().fast16labels) - ReportUnsupportedFast16("dfsan_has_label_with_desc"); const dfsan_label_info *info = dfsan_get_label_info(label); if (info->l1 != 0) { return dfsan_has_label_with_desc(info->l1, desc) || @@ -361,11 +347,9 @@ dfsan_get_label_count(void) { extern "C" SANITIZER_INTERFACE_ATTRIBUTE void dfsan_dump_labels(int fd) { - if (flags().fast16labels) - ReportUnsupportedFast16("dfsan_dump_labels"); - dfsan_label last_label = atomic_load(&__dfsan_last_label, memory_order_relaxed); + for (uptr l = 1; l <= last_label; ++l) { char buf[64]; internal_snprintf(buf, sizeof(buf), "%u %u %u ", l, diff --git a/compiler-rt/lib/fuzzer/FuzzerDefs.h b/compiler-rt/lib/fuzzer/FuzzerDefs.h index 1a2752af2f4d56..fb9eccd46324c1 100644 --- a/compiler-rt/lib/fuzzer/FuzzerDefs.h +++ b/compiler-rt/lib/fuzzer/FuzzerDefs.h @@ -20,6 +20,7 @@ #include #include +#include "FuzzerPlatform.h" namespace fuzzer { @@ -62,7 +63,8 @@ typedef Vector Unit; typedef Vector UnitVector; typedef int (*UserCallback)(const uint8_t *Data, size_t Size); -int FuzzerDriver(int *argc, char ***argv, UserCallback Callback); +ATTRIBUTE_INTERFACE int FuzzerDriver(int *argc, char ***argv, + UserCallback Callback); uint8_t *ExtraCountersBegin(); uint8_t *ExtraCountersEnd(); diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index ae085befc4f152..08b12bd5b6e6b4 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -196,6 +196,10 @@ class Allocator { #endif // GWP_ASAN_HOOKS } + ALWAYS_INLINE void initThreadMaybe(bool MinimalInit = false) { + 
TSDRegistry.initThreadMaybe(this, MinimalInit); + } + void reset() { memset(this, 0, sizeof(*this)); } void unmapTestOnly() { @@ -977,10 +981,6 @@ class Allocator { reinterpret_cast(Ptr) - SizeOrUnusedBytes; } - ALWAYS_INLINE void initThreadMaybe(bool MinimalInit = false) { - TSDRegistry.initThreadMaybe(this, MinimalInit); - } - void quarantineOrDeallocateChunk(void *Ptr, Chunk::UnpackedHeader *Header, uptr Size) { Chunk::UnpackedHeader NewHeader = *Header; diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp index 7e04afb90bb1f1..005decdaa82904 100644 --- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp @@ -67,15 +67,17 @@ void checkMemoryTaggingMaybe(AllocatorT *Allocator, void *P, scudo::uptr Size, ""); } +template struct TestAllocator : scudo::Allocator { + TestAllocator() { + this->reset(); + this->initThreadMaybe(); + } + ~TestAllocator() { this->unmapTestOnly(); } +}; + template static void testAllocator() { - using AllocatorT = scudo::Allocator; - auto Deleter = [](AllocatorT *A) { - A->unmapTestOnly(); - delete A; - }; - std::unique_ptr Allocator(new AllocatorT, - Deleter); - Allocator->reset(); + using AllocatorT = TestAllocator; + auto Allocator = std::make_unique(); EXPECT_FALSE(Allocator->isOwned(&Mutex)); EXPECT_FALSE(Allocator->isOwned(&Allocator)); @@ -348,14 +350,8 @@ template static void stressAllocator(AllocatorT *A) { } template static void testAllocatorThreaded() { - using AllocatorT = scudo::Allocator; - auto Deleter = [](AllocatorT *A) { - A->unmapTestOnly(); - delete A; - }; - std::unique_ptr Allocator(new AllocatorT, - Deleter); - Allocator->reset(); + using AllocatorT = TestAllocator; + auto Allocator = std::make_unique(); std::thread Threads[32]; for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++) Threads[I] = std::thread(stressAllocator, Allocator.get()); @@ -401,14 +397,8 @@ struct DeathConfig { }; TEST(ScudoCombinedTest, DeathCombined) { - using AllocatorT = scudo::Allocator; - auto Deleter = [](AllocatorT *A) { - A->unmapTestOnly(); - delete A; - }; - std::unique_ptr Allocator(new AllocatorT, - Deleter); - Allocator->reset(); + using AllocatorT = TestAllocator; + auto Allocator = std::make_unique(); const scudo::uptr Size = 1000U; void *P = Allocator->allocate(Size, Origin); @@ -442,14 +432,8 @@ TEST(ScudoCombinedTest, DeathCombined) { // Ensure that releaseToOS can be called prior to any other allocator // operation without issue. TEST(ScudoCombinedTest, ReleaseToOS) { - using AllocatorT = scudo::Allocator; - auto Deleter = [](AllocatorT *A) { - A->unmapTestOnly(); - delete A; - }; - std::unique_ptr Allocator(new AllocatorT, - Deleter); - Allocator->reset(); + using AllocatorT = TestAllocator; + auto Allocator = std::make_unique(); Allocator->releaseToOS(); } @@ -457,14 +441,8 @@ TEST(ScudoCombinedTest, ReleaseToOS) { // Verify that when a region gets full, the allocator will still manage to // fulfill the allocation through a larger size class. 
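The combined_test.cpp changes above fold the repeated unique_ptr-with-custom-deleter boilerplate into an RAII TestAllocator whose constructor runs reset()/initThreadMaybe() and whose destructor runs unmapTestOnly(). A minimal standalone sketch of that setup/teardown pattern, using a hypothetical Resource type in place of scudo::Allocator<Config>:

#include <cassert>
#include <memory>

// Hypothetical stand-in for scudo::Allocator<Config>; only the lifecycle
// hooks exercised by the tests are modeled here.
struct Resource {
  bool Ready = false;
  void reset() { Ready = false; }
  void initThreadMaybe() { Ready = true; }
  void unmapTestOnly() { Ready = false; }
};

// Mirrors the shape of TestAllocator: setup in the constructor, teardown in
// the destructor, so each test simply constructs the wrapper and forgets it.
struct TestResource : Resource {
  TestResource() {
    this->reset();
    this->initThreadMaybe();
  }
  ~TestResource() { this->unmapTestOnly(); }
};

int main() {
  // Analogous to `auto Allocator = std::make_unique<AllocatorT>();` above.
  auto R = std::make_unique<TestResource>();
  assert(R->Ready);
  return 0; // the destructor performs the unmap step automatically
}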
TEST(ScudoCombinedTest, FullRegion) { - using AllocatorT = scudo::Allocator; - auto Deleter = [](AllocatorT *A) { - A->unmapTestOnly(); - delete A; - }; - std::unique_ptr Allocator(new AllocatorT, - Deleter); - Allocator->reset(); + using AllocatorT = TestAllocator; + auto Allocator = std::make_unique(); std::vector V; scudo::uptr FailedAllocationsCount = 0; diff --git a/compiler-rt/test/dfsan/fast16labels.c b/compiler-rt/test/dfsan/fast16labels.c index 90c039234c336a..a9b71603954045 100644 --- a/compiler-rt/test/dfsan/fast16labels.c +++ b/compiler-rt/test/dfsan/fast16labels.c @@ -1,13 +1,4 @@ -// RUN: %clang_dfsan %s -o %t -// RUN: DFSAN_OPTIONS=fast16labels=1 %run %t -// RUN: DFSAN_OPTIONS=fast16labels=1 not %run %t dfsan_create_label 2>&1 \ -// RUN: | FileCheck %s --check-prefix=CREATE-LABEL -// RUN: DFSAN_OPTIONS=fast16labels=1 not %run %t dfsan_get_label_info 2>&1 \ -// RUN: | FileCheck %s --check-prefix=GET-LABEL-INFO -// RUN: DFSAN_OPTIONS=fast16labels=1 not %run %t dfsan_has_label_with_desc \ -// RUN: 2>&1 | FileCheck %s --check-prefix=HAS-LABEL-WITH-DESC -// RUN: DFSAN_OPTIONS=fast16labels=1:dump_labels_at_exit=/dev/stdout not %run \ -// RUN: %t 2>&1 | FileCheck %s --check-prefix=DUMP-LABELS +// RUN: %clang_dfsan %s -o %t && DFSAN_OPTIONS=fast16labels=1 %run %t // // Tests DFSAN_OPTIONS=fast16labels=1 // @@ -15,45 +6,20 @@ #include #include -#include int foo(int a, int b) { return a + b; } -int main(int argc, char *argv[]) { - // Death tests for unsupported API usage. - const char *command = (argc < 2) ? "" : argv[1]; - fprintf(stderr, "Running with command %s\n", command); - // CREATE-LABEL: FATAL: DataFlowSanitizer: dfsan_create_label is unsupported - if (strcmp(command, "dfsan_create_label") == 0) - dfsan_create_label("", NULL); - // GET-LABEL-INFO: FATAL: DataFlowSanitizer: dfsan_get_label_info is unsupported - if (strcmp(command, "dfsan_get_label_info") == 0) - dfsan_get_label_info(1); - // HAS-LABEL-WITH-DESC: FATAL: DataFlowSanitizer: dfsan_has_label_with_desc is unsupported - if (strcmp(command, "dfsan_has_label_with_desc") == 0) - dfsan_has_label_with_desc(1, ""); - // DUMP-LABELS: FATAL: DataFlowSanitizer: dfsan_dump_labels is unsupported - - // Supported usage. +int main() { int a = 10; int b = 20; dfsan_set_label(8, &a, sizeof(a)); dfsan_set_label(512, &b, sizeof(b)); int c = foo(a, b); - fprintf(stderr, "A: 0x%x\n", dfsan_get_label(a)); - fprintf(stderr, "B: 0x%x\n", dfsan_get_label(b)); + printf("A: 0x%x\n", dfsan_get_label(a)); + printf("B: 0x%x\n", dfsan_get_label(b)); dfsan_label l = dfsan_get_label(c); - fprintf(stderr, "C: 0x%x\n", l); - fprintf(stderr, "Testing l == 520\n"); + printf("C: 0x%x\n", l); assert(l == 520); // OR of the other two labels. 
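With fast16labels=1 a label is a 16-bit bitmask and the union of two labels is their bitwise OR, which is why the combined label of c is expected to be 520 (0x8 | 0x200 == 0x208). A tiny standalone check of that arithmetic, independent of the DFSan runtime:

#include <cassert>
#include <cstdint>

int main() {
  const uint16_t LabelA = 0x8;   // dfsan_set_label(8, &a, sizeof(a))
  const uint16_t LabelB = 0x200; // dfsan_set_label(512, &b, sizeof(b))
  const uint16_t Union = LabelA | LabelB; // fast16labels union is a bitwise OR
  assert(Union == 520);          // 0x208, matching the assert in the test
  return 0;
}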
- fprintf(stderr, "Testing dfsan_has_label(l, 8)\n"); - assert(dfsan_has_label(l, 8)); - fprintf(stderr, "Testing dfsan_has_label(l, 512)\n"); - assert(dfsan_has_label(l, 512)); - fprintf(stderr, "Testing !dfsan_has_label(l, 1)\n"); - assert(!dfsan_has_label(l, 1)); - fprintf(stderr, "returning...\n"); - return 0; } diff --git a/flang/include/flang/Lower/OpenACC.h b/flang/include/flang/Lower/OpenACC.h index 0148e78fed09c2..a3e6f177f91de6 100644 --- a/flang/include/flang/Lower/OpenACC.h +++ b/flang/include/flang/Lower/OpenACC.h @@ -5,6 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// #ifndef FORTRAN_LOWER_OPENACC_H #define FORTRAN_LOWER_OPENACC_H diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 805062eb297f5c..7202d4ec031991 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -5,6 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// #include "flang/Lower/OpenACC.h" #include "flang/Lower/Bridge.h" diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 4a0aa28c9d2b2c..6a1ff3bd64a9ba 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -32,6 +32,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.floor libc.src.math.floorf libc.src.math.floorl + libc.src.math.fmax + libc.src.math.fmaxf + libc.src.math.fmaxl libc.src.math.fmin libc.src.math.fminf libc.src.math.fminl diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td index 7fc199eabc6bf0..1ec1a024f85d09 100644 --- a/libc/config/linux/api.td +++ b/libc/config/linux/api.td @@ -163,6 +163,9 @@ def MathAPI : PublicAPI<"math.h"> { "floor", "floorf", "floorl", + "fmax", + "fmaxf", + "fmaxl", "fmin", "fminf", "fminl", diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 04acfb31da04bd..b20f58c4518471 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -68,6 +68,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.fmin libc.src.math.fminf libc.src.math.fminl + libc.src.math.fmax + libc.src.math.fmaxf + libc.src.math.fmaxl libc.src.math.frexp libc.src.math.frexpf libc.src.math.frexpl diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index cdeee89be50713..6a11b002d87429 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -209,6 +209,10 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"fminf", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"fminl", RetValSpec, [ArgSpec, ArgSpec]>, + FunctionSpec<"fmax", RetValSpec, [ArgSpec, ArgSpec]>, + FunctionSpec<"fmaxf", RetValSpec, [ArgSpec, ArgSpec]>, + FunctionSpec<"fmaxl", RetValSpec, [ArgSpec, ArgSpec]>, + FunctionSpec<"frexp", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"frexpf", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"frexpl", RetValSpec, [ArgSpec, ArgSpec]>, diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index d2d694807168ea..da18aeba9a2a50 100644 --- a/libc/src/math/CMakeLists.txt +++ 
b/libc/src/math/CMakeLists.txt @@ -449,3 +449,39 @@ add_entrypoint_object( COMPILE_OPTIONS -O2 ) + +add_entrypoint_object( + fmax + SRCS + fmax.cpp + HDRS + fmax.h + DEPENDS + libc.utils.FPUtil.fputil + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fmaxf + SRCS + fmaxf.cpp + HDRS + fmaxf.h + DEPENDS + libc.utils.FPUtil.fputil + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fmaxl + SRCS + fmaxl.cpp + HDRS + fmaxl.h + DEPENDS + libc.utils.FPUtil.fputil + COMPILE_OPTIONS + -O2 +) diff --git a/libc/src/math/fmax.cpp b/libc/src/math/fmax.cpp new file mode 100644 index 00000000000000..ba5d93388998eb --- /dev/null +++ b/libc/src/math/fmax.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of fmax function -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/common.h" +#include "utils/FPUtil/BasicOperations.h" + +namespace __llvm_libc { + +double LLVM_LIBC_ENTRYPOINT(fmax)(double x, double y) { + return fputil::fmax(x, y); +} + +} // namespace __llvm_libc diff --git a/libc/src/math/fmax.h b/libc/src/math/fmax.h new file mode 100644 index 00000000000000..9f057983d28bb7 --- /dev/null +++ b/libc/src/math/fmax.h @@ -0,0 +1,18 @@ +//===-- Implementation header for fmax --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FMAX_H +#define LLVM_LIBC_SRC_MATH_FMAX_H + +namespace __llvm_libc { + +double fmax(double x, double y); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_MATH_FMAX_H diff --git a/libc/src/math/fmaxf.cpp b/libc/src/math/fmaxf.cpp new file mode 100644 index 00000000000000..55629040eba665 --- /dev/null +++ b/libc/src/math/fmaxf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of fmaxf function ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/common.h" +#include "utils/FPUtil/BasicOperations.h" + +namespace __llvm_libc { + +float LLVM_LIBC_ENTRYPOINT(fmaxf)(float x, float y) { + return fputil::fmax(x, y); +} + +} // namespace __llvm_libc diff --git a/libc/src/math/fmaxf.h b/libc/src/math/fmaxf.h new file mode 100644 index 00000000000000..e37df5cf9565d4 --- /dev/null +++ b/libc/src/math/fmaxf.h @@ -0,0 +1,18 @@ +//===-- Implementation header for fmaxf -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FMAXF_H +#define LLVM_LIBC_SRC_MATH_FMAXF_H + +namespace __llvm_libc { + +float fmaxf(float x, float y); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_MATH_FMAXF_H diff --git a/libc/src/math/fmaxl.cpp b/libc/src/math/fmaxl.cpp new file mode 100644 index 00000000000000..c944187f26b7a6 --- /dev/null +++ b/libc/src/math/fmaxl.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of fmaxl function ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/common.h" +#include "utils/FPUtil/BasicOperations.h" + +namespace __llvm_libc { + +long double LLVM_LIBC_ENTRYPOINT(fmaxl)(long double x, long double y) { + return fputil::fmax(x, y); +} + +} // namespace __llvm_libc diff --git a/libc/src/math/fmaxl.h b/libc/src/math/fmaxl.h new file mode 100644 index 00000000000000..41d80ba4aa52ca --- /dev/null +++ b/libc/src/math/fmaxl.h @@ -0,0 +1,18 @@ +//===-- Implementation header for fmaxl -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FMAXL_H +#define LLVM_LIBC_SRC_MATH_FMAXL_H + +namespace __llvm_libc { + +long double fmaxl(long double x, long double y); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_MATH_FMAXL_H diff --git a/libc/src/math/fminf.cpp b/libc/src/math/fminf.cpp index 8f5b54873f8f50..97ad399f4a2bcf 100644 --- a/libc/src/math/fminf.cpp +++ b/libc/src/math/fminf.cpp @@ -15,4 +15,4 @@ float LLVM_LIBC_ENTRYPOINT(fminf)(float x, float y) { return fputil::fmin(x, y); } -} // namespace __llvm_libc \ No newline at end of file +} // namespace __llvm_libc diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index a6898b3706213f..3bd40f6b32f14a 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -472,4 +472,40 @@ add_math_unittest( libc.include.math libc.src.math.fminl libc.utils.FPUtil.fputil -) \ No newline at end of file +) + +add_math_unittest( + fmaxf_test + SUITE + libc_math_unittests + SRCS + fmaxf_test.cpp + DEPENDS + libc.include.math + libc.src.math.fmaxf + libc.utils.FPUtil.fputil +) + +add_math_unittest( + fmax_test + SUITE + libc_math_unittests + SRCS + fmax_test.cpp + DEPENDS + libc.include.math + libc.src.math.fmax + libc.utils.FPUtil.fputil +) + +add_math_unittest( + fmaxl_test + SUITE + libc_math_unittests + SRCS + fmaxl_test.cpp + DEPENDS + libc.include.math + libc.src.math.fmaxl + libc.utils.FPUtil.fputil +) diff --git a/libc/test/src/math/fmax_test.cpp b/libc/test/src/math/fmax_test.cpp new file mode 100644 index 00000000000000..8d1c6c2763821a --- /dev/null +++ b/libc/test/src/math/fmax_test.cpp @@ -0,0 +1,73 @@ +//===-- Unittests for fmax -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#include "include/math.h" +#include "src/math/fmax.h" +#include "utils/FPUtil/FPBits.h" +#include "utils/UnitTest/Test.h" + +using FPBits = __llvm_libc::fputil::FPBits; + +double nan = FPBits::buildNaN(1); +double inf = FPBits::inf(); +double negInf = FPBits::negInf(); + +TEST(FmaxTest, NaNArg) { + EXPECT_EQ(inf, __llvm_libc::fmax(nan, inf)); + EXPECT_EQ(negInf, __llvm_libc::fmax(negInf, nan)); + EXPECT_EQ(0.0, __llvm_libc::fmax(nan, 0.0)); + EXPECT_EQ(-0.0, __llvm_libc::fmax(-0.0, nan)); + EXPECT_EQ(-1.2345, __llvm_libc::fmax(nan, -1.2345)); + EXPECT_EQ(1.2345, __llvm_libc::fmax(1.2345, nan)); + EXPECT_NE(isnan(__llvm_libc::fmax(nan, nan)), 0); +} + +TEST(FmaxTest, InfArg) { + EXPECT_EQ(inf, __llvm_libc::fmax(negInf, inf)); + EXPECT_EQ(inf, __llvm_libc::fmax(inf, 0.0)); + EXPECT_EQ(inf, __llvm_libc::fmax(-0.0, inf)); + EXPECT_EQ(inf, __llvm_libc::fmax(inf, 1.2345)); + EXPECT_EQ(inf, __llvm_libc::fmax(-1.2345, inf)); +} + +TEST(FmaxTest, NegInfArg) { + EXPECT_EQ(inf, __llvm_libc::fmax(inf, negInf)); + EXPECT_EQ(0.0, __llvm_libc::fmax(negInf, 0.0)); + EXPECT_EQ(-0.0, __llvm_libc::fmax(-0.0, negInf)); + EXPECT_EQ(-1.2345, __llvm_libc::fmax(negInf, -1.2345)); + EXPECT_EQ(1.2345, __llvm_libc::fmax(1.2345, negInf)); +} + +TEST(FmaxTest, BothZero) { + EXPECT_EQ(0.0, __llvm_libc::fmax(0.0, 0.0)); + EXPECT_EQ(0.0, __llvm_libc::fmax(-0.0, 0.0)); + EXPECT_EQ(0.0, __llvm_libc::fmax(0.0, -0.0)); + EXPECT_EQ(-0.0, __llvm_libc::fmax(-0.0, -0.0)); +} + +TEST(FmaxTest, InDoubleRange) { + using UIntType = FPBits::UIntType; + constexpr UIntType count = 10000001; + constexpr UIntType step = UIntType(-1) / count; + for (UIntType i = 0, v = 0, w = UIntType(-1); i <= count; + ++i, v += step, w -= step) { + double x = FPBits(v), y = FPBits(w); + if (isnan(x) || isinf(x)) + continue; + if (isnan(y) || isinf(y)) + continue; + if ((x == 0) && (y == 0)) + continue; + + if (x > y) { + ASSERT_EQ(x, __llvm_libc::fmax(x, y)); + } else { + ASSERT_EQ(y, __llvm_libc::fmax(x, y)); + } + } +} diff --git a/libc/test/src/math/fmaxf_test.cpp b/libc/test/src/math/fmaxf_test.cpp new file mode 100644 index 00000000000000..fe9eaad171bd28 --- /dev/null +++ b/libc/test/src/math/fmaxf_test.cpp @@ -0,0 +1,73 @@ +//===-- Unittests for fmaxf -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#include "include/math.h" +#include "src/math/fmaxf.h" +#include "utils/FPUtil/FPBits.h" +#include "utils/UnitTest/Test.h" + +using FPBits = __llvm_libc::fputil::FPBits; + +float nan = FPBits::buildNaN(1); +float inf = FPBits::inf(); +float negInf = FPBits::negInf(); + +TEST(FmaxfTest, NaNArg) { + EXPECT_EQ(inf, __llvm_libc::fmaxf(nan, inf)); + EXPECT_EQ(negInf, __llvm_libc::fmaxf(negInf, nan)); + EXPECT_EQ(0.0f, __llvm_libc::fmaxf(nan, 0.0f)); + EXPECT_EQ(-0.0f, __llvm_libc::fmaxf(-0.0f, nan)); + EXPECT_EQ(-1.2345f, __llvm_libc::fmaxf(nan, -1.2345f)); + EXPECT_EQ(1.2345f, __llvm_libc::fmaxf(1.2345f, nan)); + EXPECT_NE(isnan(__llvm_libc::fmaxf(nan, nan)), 0); +} + +TEST(FmaxfTest, InfArg) { + EXPECT_EQ(inf, __llvm_libc::fmaxf(negInf, inf)); + EXPECT_EQ(inf, __llvm_libc::fmaxf(inf, 0.0f)); + EXPECT_EQ(inf, __llvm_libc::fmaxf(-0.0f, inf)); + EXPECT_EQ(inf, __llvm_libc::fmaxf(inf, 1.2345f)); + EXPECT_EQ(inf, __llvm_libc::fmaxf(-1.2345f, inf)); +} + +TEST(FmaxfTest, NegInfArg) { + EXPECT_EQ(inf, __llvm_libc::fmaxf(inf, negInf)); + EXPECT_EQ(0.0f, __llvm_libc::fmaxf(negInf, 0.0f)); + EXPECT_EQ(-0.0f, __llvm_libc::fmaxf(-0.0f, negInf)); + EXPECT_EQ(-1.2345f, __llvm_libc::fmaxf(negInf, -1.2345f)); + EXPECT_EQ(1.2345f, __llvm_libc::fmaxf(1.2345f, negInf)); +} + +TEST(FmaxfTest, BothZero) { + EXPECT_EQ(0.0f, __llvm_libc::fmaxf(0.0f, 0.0f)); + EXPECT_EQ(0.0f, __llvm_libc::fmaxf(-0.0f, 0.0f)); + EXPECT_EQ(0.0f, __llvm_libc::fmaxf(0.0f, -0.0f)); + EXPECT_EQ(-0.0f, __llvm_libc::fmaxf(-0.0f, -0.0f)); +} + +TEST(FmaxfTest, InFloatRange) { + using UIntType = FPBits::UIntType; + constexpr UIntType count = 10000001; + constexpr UIntType step = UIntType(-1) / count; + for (UIntType i = 0, v = 0, w = UIntType(-1); i <= count; + ++i, v += step, w -= step) { + float x = FPBits(v), y = FPBits(w); + if (isnan(x) || isinf(x)) + continue; + if (isnan(y) || isinf(y)) + continue; + if ((x == 0) && (y == 0)) + continue; + + if (x > y) { + ASSERT_EQ(x, __llvm_libc::fmaxf(x, y)); + } else { + ASSERT_EQ(y, __llvm_libc::fmaxf(x, y)); + } + } +} diff --git a/libc/test/src/math/fmaxl_test.cpp b/libc/test/src/math/fmaxl_test.cpp new file mode 100644 index 00000000000000..9c7aa21582ebc7 --- /dev/null +++ b/libc/test/src/math/fmaxl_test.cpp @@ -0,0 +1,73 @@ +//===-- Unittests for fmaxl -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#include "include/math.h" +#include "src/math/fmaxl.h" +#include "utils/FPUtil/FPBits.h" +#include "utils/UnitTest/Test.h" + +using FPBits = __llvm_libc::fputil::FPBits; + +long double nan = FPBits::buildNaN(1); +long double inf = FPBits::inf(); +long double negInf = FPBits::negInf(); + +TEST(FmaxlTest, NaNArg) { + EXPECT_EQ(inf, __llvm_libc::fmaxl(nan, inf)); + EXPECT_EQ(negInf, __llvm_libc::fmaxl(negInf, nan)); + EXPECT_EQ(0.0L, __llvm_libc::fmaxl(nan, 0.0L)); + EXPECT_EQ(-0.0L, __llvm_libc::fmaxl(-0.0L, nan)); + EXPECT_EQ(-1.2345L, __llvm_libc::fmaxl(nan, -1.2345L)); + EXPECT_EQ(1.2345L, __llvm_libc::fmaxl(1.2345L, nan)); + EXPECT_NE(isnan(__llvm_libc::fmaxl(nan, nan)), 0); +} + +TEST(FmaxlTest, InfArg) { + EXPECT_EQ(inf, __llvm_libc::fmaxl(negInf, inf)); + EXPECT_EQ(inf, __llvm_libc::fmaxl(inf, 0.0L)); + EXPECT_EQ(inf, __llvm_libc::fmaxl(-0.0L, inf)); + EXPECT_EQ(inf, __llvm_libc::fmaxl(inf, 1.2345L)); + EXPECT_EQ(inf, __llvm_libc::fmaxl(-1.2345L, inf)); +} + +TEST(FmaxlTest, NegInfArg) { + EXPECT_EQ(inf, __llvm_libc::fmaxl(inf, negInf)); + EXPECT_EQ(0.0L, __llvm_libc::fmaxl(negInf, 0.0L)); + EXPECT_EQ(-0.0L, __llvm_libc::fmaxl(-0.0L, negInf)); + EXPECT_EQ(-1.2345L, __llvm_libc::fmaxl(negInf, -1.2345L)); + EXPECT_EQ(1.2345L, __llvm_libc::fmaxl(1.2345L, negInf)); +} + +TEST(FmaxlTest, BothZero) { + EXPECT_EQ(0.0L, __llvm_libc::fmaxl(0.0L, 0.0L)); + EXPECT_EQ(0.0L, __llvm_libc::fmaxl(-0.0L, 0.0L)); + EXPECT_EQ(0.0L, __llvm_libc::fmaxl(0.0L, -0.0L)); + EXPECT_EQ(-0.0L, __llvm_libc::fmaxl(-0.0L, -0.0L)); +} + +TEST(FmaxlTest, InLongDoubleRange) { + using UIntType = FPBits::UIntType; + constexpr UIntType count = 10000001; + constexpr UIntType step = UIntType(-1) / count; + for (UIntType i = 0, v = 0, w = UIntType(-1); i <= count; + ++i, v += step, w -= step) { + long double x = FPBits(v), y = FPBits(w); + if (isnan(x) || isinf(x)) + continue; + if (isnan(y) || isinf(y)) + continue; + if ((x == 0) && (y == 0)) + continue; + + if (x > y) { + ASSERT_EQ(x, __llvm_libc::fmaxl(x, y)); + } else { + ASSERT_EQ(y, __llvm_libc::fmaxl(x, y)); + } + } +} diff --git a/libc/test/src/math/fmin_test.cpp b/libc/test/src/math/fmin_test.cpp index c1467e4cbb9007..7fd1e8af1e510e 100644 --- a/libc/test/src/math/fmin_test.cpp +++ b/libc/test/src/math/fmin_test.cpp @@ -6,8 +6,6 @@ // //===---------------------------------------------------------------------===// -#include - #include "include/math.h" #include "src/math/fmin.h" #include "utils/FPUtil/FPBits.h" diff --git a/libc/test/src/math/fminf_test.cpp b/libc/test/src/math/fminf_test.cpp index 8d03d92f723701..dea4e7ccf52462 100644 --- a/libc/test/src/math/fminf_test.cpp +++ b/libc/test/src/math/fminf_test.cpp @@ -6,8 +6,6 @@ // //===---------------------------------------------------------------------===// -#include - #include "include/math.h" #include "src/math/fminf.h" #include "utils/FPUtil/FPBits.h" diff --git a/libc/test/src/math/fminl_test.cpp b/libc/test/src/math/fminl_test.cpp index 519c1f33fba6e6..3eabb55b0cab23 100644 --- a/libc/test/src/math/fminl_test.cpp +++ b/libc/test/src/math/fminl_test.cpp @@ -6,8 +6,6 @@ // //===---------------------------------------------------------------------===// -#include - #include "include/math.h" #include "src/math/fminl.h" #include "utils/FPUtil/FPBits.h" diff --git a/libc/utils/FPUtil/BasicOperations.h b/libc/utils/FPUtil/BasicOperations.h index 
2f86ddf5678cfd..78856926af4bfb 100644 --- a/libc/utils/FPUtil/BasicOperations.h +++ b/libc/utils/FPUtil/BasicOperations.h @@ -43,6 +43,25 @@ static inline T fmin(T x, T y) { } } +template ::Value, int> = 0> +static inline T fmax(T x, T y) { + FPBits bitx(x), bity(y); + + if (bitx.isNaN()) { + return y; + } else if (bity.isNaN()) { + return x; + } else if (bitx.sign != bity.sign) { + // To make sure that fmax(+0, -0) == +0 == fmax(-0, +0), whenever x and + // y has different signs and both are not NaNs, we return the number + // with positive sign. + return (bitx.sign ? y : x); + } else { + return (x > y ? x : y); + } +} + } // namespace fputil } // namespace __llvm_libc diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index 5fa3b66c99a7f7..fd877f1adaeb49 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -526,6 +526,11 @@ function(cxx_add_basic_build_flags target) CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO) + # When building the dylib, don't warn for unavailable aligned allocation + # functions based on the deployment target -- they are always available + # because they are provided by the dylib itself. + target_add_compile_flags_if_supported(${target} PRIVATE -faligned-allocation) + # On all systems the system c++ standard library headers need to be excluded. # MSVC only has -X, which disables all default includes; including the crt. # Thus, we do nothing and hope we don't accidentally include any of the C++ diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 271364c5c6ec39..6b9c5c68998198 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -12258,14 +12258,172 @@ Example: call void @llvm.call.preallocated.teardown(token %cs) ret void -Standard C Library Intrinsics ------------------------------ +Standard C/C++ Library Intrinsics +--------------------------------- -LLVM provides intrinsics for a few important standard C library +LLVM provides intrinsics for a few important standard C/C++ library functions. These intrinsics allow source-language front-ends to pass information about the alignment of the pointer arguments to the code generator, providing opportunity for more efficient code generation. + +'``llvm.abs.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +This is an overloaded intrinsic. You can use ``llvm.abs`` on any +integer bit width or any vector of integer elements. + +:: + + declare i32 @llvm.abs.i32(i32 , i1 ) + declare <4 x i32> @llvm.abs.v4i32(<4 x i32> , i1 ) + +Overview: +""""""""" + +The '``llvm.abs``' family of intrinsic functions returns the absolute value +of an argument. + +Arguments: +"""""""""" + +The first argument is the value for which the absolute value is to be returned. +This argument may be of any integer type or a vector with integer element type. +The return type must match the first argument type. + +The second argument must be a constant and is a flag to indicate whether the +result value of the '``llvm.abs``' intrinsic is a +:ref:`poison value ` if the argument is statically or dynamically +an ``INT_MIN`` value. + +Semantics: +"""""""""" + +The '``llvm.abs``' intrinsic returns the magnitude (always positive) of the +argument or each element of a vector argument.". If the argument is ``INT_MIN``, +then the result is also ``INT_MIN`` if ``is_int_min_poison == 0`` and +``poison`` otherwise. + + +'``llvm.smax.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +This is an overloaded intrinsic. 
You can use ``@llvm.smax`` on any +integer bit width or any vector of integer elements. + +:: + + declare i32 @llvm.smax.i32(i32 %a, i32 %b) + declare <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b) + +Overview: +""""""""" + +Return the larger of ``%a`` and ``%b`` comparing the values as signed integers. +Vector intrinsics operate on a per-element basis. The larger element of ``%a`` +and ``%b`` at a given index is returned for that index. + +Arguments: +"""""""""" + +The arguments (``%a`` and ``%b``) may be of any integer type or a vector with +integer element type. The argument types must match each other, and the return +type must match the argument type. + + +'``llvm.smin.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +This is an overloaded intrinsic. You can use ``@llvm.smin`` on any +integer bit width or any vector of integer elements. + +:: + + declare i32 @llvm.smin.i32(i32 %a, i32 %b) + declare <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> %b) + +Overview: +""""""""" + +Return the smaller of ``%a`` and ``%b`` comparing the values as signed integers. +Vector intrinsics operate on a per-element basis. The smaller element of ``%a`` +and ``%b`` at a given index is returned for that index. + +Arguments: +"""""""""" + +The arguments (``%a`` and ``%b``) may be of any integer type or a vector with +integer element type. The argument types must match each other, and the return +type must match the argument type. + + +'``llvm.umax.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +This is an overloaded intrinsic. You can use ``@llvm.umax`` on any +integer bit width or any vector of integer elements. + +:: + + declare i32 @llvm.umax.i32(i32 %a, i32 %b) + declare <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> %b) + +Overview: +""""""""" + +Return the larger of ``%a`` and ``%b`` comparing the values as unsigned +integers. Vector intrinsics operate on a per-element basis. The larger element +of ``%a`` and ``%b`` at a given index is returned for that index. + +Arguments: +"""""""""" + +The arguments (``%a`` and ``%b``) may be of any integer type or a vector with +integer element type. The argument types must match each other, and the return +type must match the argument type. + + +'``llvm.umin.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +This is an overloaded intrinsic. You can use ``@llvm.umin`` on any +integer bit width or any vector of integer elements. + +:: + + declare i32 @llvm.umin.i32(i32 %a, i32 %b) + declare <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> %b) + +Overview: +""""""""" + +Return the smaller of ``%a`` and ``%b`` comparing the values as unsigned +integers. Vector intrinsics operate on a per-element basis. The smaller element +of ``%a`` and ``%b`` at a given index is returned for that index. + +Arguments: +"""""""""" + +The arguments (``%a`` and ``%b``) may be of any integer type or a vector with +integer element type. The argument types must match each other, and the return +type must match the argument type. + + .. 
_int_memcpy: '``llvm.memcpy``' Intrinsic diff --git a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h index 0163e69ac9dd46..a39c4e5413d8d2 100644 --- a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h +++ b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h @@ -1,44 +1,85 @@ -//=- FunctionPropertiesAnalysis.h - Function Properties extraction -*- C++ -=// +//=- FunctionPropertiesAnalysis.h - Function Properties Analysis --*- C++ -*-=// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// +// This file defines the FunctionPropertiesInfo and FunctionPropertiesAnalysis +// classes used to extract function properties. +// +//===----------------------------------------------------------------------===// #ifndef LLVM_FUNCTIONPROPERTIESANALYSIS_H_ #define LLVM_FUNCTIONPROPERTIESANALYSIS_H_ +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/PassManager.h" namespace llvm { class Function; +class FunctionPropertiesInfo { +public: + static FunctionPropertiesInfo getFunctionPropertiesInfo(const Function &F, + const LoopInfo &LI); + + void print(raw_ostream &OS) const; + + /// Number of basic blocks + int64_t BasicBlockCount = 0; + + /// Number of blocks reached from a conditional instruction, or that are + /// 'cases' of a SwitchInstr. + // FIXME: We may want to replace this with a more meaningful metric, like + // number of conditionally executed blocks: + // 'if (a) s();' would be counted here as 2 blocks, just like + // 'if (a) s(); else s2(); s3();' would. + int64_t BlocksReachedFromConditionalInstruction = 0; + + /// Number of uses of this function, plus 1 if the function is callable + /// outside the module. + int64_t Uses = 0; + + /// Number of direct calls made from this function to other functions + /// defined in this module. + int64_t DirectCallsToDefinedFunctions = 0; + + // Load Instruction Count + int64_t LoadInstCount = 0; + + // Store Instruction Count + int64_t StoreInstCount = 0; + + // Maximum Loop Depth in the Function + int64_t MaxLoopDepth = 0; + + // Number of Top Level Loops in the Function + int64_t TopLevelLoopCount = 0; +}; + +// Analysis pass class FunctionPropertiesAnalysis : public AnalysisInfoMixin { + public: static AnalysisKey Key; - struct Result { - /// Number of basic blocks - int64_t BasicBlockCount = 0; - - /// Number of blocks reached from a conditional instruction, or that are - /// 'cases' of a SwitchInstr. - // FIXME: We may want to replace this with a more meaningful metric, like - // number of conditionally executed blocks: - // 'if (a) s();' would be counted here as 2 blocks, just like - // 'if (a) s(); else s2(); s3();' would. - int64_t BlocksReachedFromConditionalInstruction = 0; - - /// Number of uses of this function, plus 1 if the function is callable - /// outside the module. - int64_t Uses = 0; - - /// Number of direct calls made from this function to other functions - /// defined in this module. - int64_t DirectCallsToDefinedFunctions = 0; - }; - Result run(const Function &F, FunctionAnalysisManager &FAM); + + using Result = FunctionPropertiesInfo; + + Result run(Function &F, FunctionAnalysisManager &FAM); +}; + +/// Printer pass for the FunctionPropertiesAnalysis results. 
+class FunctionPropertiesPrinterPass + : public PassInfoMixin { + raw_ostream &OS; + +public: + explicit FunctionPropertiesPrinterPass(raw_ostream &OS) : OS(OS) {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 43a8cb2a1d51c8..e694e7ad2c8348 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -107,6 +107,9 @@ class CombinerHelper { bool matchCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo); void applyCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo); + bool matchSextTruncSextLoad(MachineInstr &MI); + bool applySextTruncSextLoad(MachineInstr &MI); + bool matchElideBrByInvertingCond(MachineInstr &MI); void applyElideBrByInvertingCond(MachineInstr &MI); bool tryElideBrByInvertingCond(MachineInstr &MI); diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 4918ea876df65e..0a071464804ebe 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1039,6 +1039,25 @@ def int_udiv_fix_sat : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem, ImmArg>]>; +//===------------------ Integer Min/Max/Abs Intrinsics --------------------===// +// +def int_abs : Intrinsic< + [llvm_anyint_ty], [LLVMMatchType<0>, llvm_i1_ty], + [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg>]>; + +def int_smax : Intrinsic< + [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; +def int_smin : Intrinsic< + [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; +def int_umax : Intrinsic< + [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; +def int_umin : Intrinsic< + [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; + //===------------------------- Memory Use Markers -------------------------===// // def int_lifetime_start : Intrinsic<[], diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index eeb2761faeb9f1..40ed6be089ac8d 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -125,6 +125,12 @@ def extending_loads : GICombineRule< (apply [{ Helper.applyCombineExtendingLoads(*${root}, ${matchinfo}); }])>; def combines_for_extload: GICombineGroup<[extending_loads]>; +def sext_trunc_sextload : GICombineRule< + (defs root:$d), + (match (wip_match_opcode G_SEXT_INREG):$d, + [{ return Helper.matchSextTruncSextLoad(*${d}); }]), + (apply [{ Helper.applySextTruncSextLoad(*${d}); }])>; + def combine_indexed_load_store : GICombineRule< (defs root:$root, indexed_load_store_matchdata:$matchinfo), (match (wip_match_opcode G_LOAD, G_SEXTLOAD, G_ZEXTLOAD, G_STORE):$root, diff --git a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp index a0fc017568f2d6..3db108c94985a4 100644 --- a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp +++ b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp @@ -1,4 +1,4 @@ -//===- FunctionPropertiesAnalysis.cpp - Function properties extraction ----===// +//===- FunctionPropertiesAnalysis.cpp - Function Properties 
Analysis ------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// // -// This file implements an analysis extracting function features, which may be -// used by ML-driven policies, for example. +// This file defines the FunctionPropertiesInfo and FunctionPropertiesAnalysis +// classes used to extract function properties. // //===----------------------------------------------------------------------===// @@ -16,27 +16,75 @@ using namespace llvm; -AnalysisKey FunctionPropertiesAnalysis::Key; +FunctionPropertiesInfo +FunctionPropertiesInfo::getFunctionPropertiesInfo(const Function &F, + const LoopInfo &LI) { + + FunctionPropertiesInfo FPI; + + FPI.Uses = ((!F.hasLocalLinkage()) ? 1 : 0) + F.getNumUses(); -FunctionPropertiesAnalysis::Result -FunctionPropertiesAnalysis::run(const Function &F, - FunctionAnalysisManager &FAM) { - Result Ret; - Ret.Uses = ((!F.hasLocalLinkage()) ? 1 : 0) + F.getNumUses(); for (const auto &BB : F) { - ++Ret.BasicBlockCount; + ++FPI.BasicBlockCount; + if (const auto *BI = dyn_cast(BB.getTerminator())) { if (BI->isConditional()) - Ret.BlocksReachedFromConditionalInstruction += BI->getNumSuccessors(); - } else if (const auto *SI = dyn_cast(BB.getTerminator())) - Ret.BlocksReachedFromConditionalInstruction += + FPI.BlocksReachedFromConditionalInstruction += BI->getNumSuccessors(); + } else if (const auto *SI = dyn_cast(BB.getTerminator())) { + FPI.BlocksReachedFromConditionalInstruction += (SI->getNumCases() + (nullptr != SI->getDefaultDest())); - for (const auto &I : BB) + } + + for (const auto &I : BB) { if (auto *CS = dyn_cast(&I)) { const auto *Callee = CS->getCalledFunction(); if (Callee && !Callee->isIntrinsic() && !Callee->isDeclaration()) - ++Ret.DirectCallsToDefinedFunctions; + ++FPI.DirectCallsToDefinedFunctions; + } + if (I.getOpcode() == Instruction::Load) { + ++FPI.LoadInstCount; + } else if (I.getOpcode() == Instruction::Store) { + ++FPI.StoreInstCount; } + } + // Loop Depth of the Basic Block + int64_t LoopDepth; + LoopDepth = LI.getLoopDepth(&BB); + if (FPI.MaxLoopDepth < LoopDepth) + FPI.MaxLoopDepth = LoopDepth; } - return Ret; + for (Loop *L : LI) { + ++FPI.TopLevelLoopCount; + } + return FPI; +} + +void FunctionPropertiesInfo::print(raw_ostream &OS) const { + OS << "BasicBlockCount: " << BasicBlockCount << "\n" + << "BlocksReachedFromConditionalInstruction: " + << BlocksReachedFromConditionalInstruction << "\n" + << "Uses: " << Uses << "\n" + << "DirectCallsToDefinedFunctions: " << DirectCallsToDefinedFunctions + << "\n" + << "LoadInstCount: " << LoadInstCount << "\n" + << "StoreInstCount: " << StoreInstCount << "\n" + << "MaxLoopDepth: " << MaxLoopDepth << "\n" + << "TopLevelLoopCount: " << TopLevelLoopCount << "\n\n"; +} + +AnalysisKey FunctionPropertiesAnalysis::Key; + +FunctionPropertiesInfo +FunctionPropertiesAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { + return FunctionPropertiesInfo::getFunctionPropertiesInfo( + F, FAM.getResult(F)); +} + +PreservedAnalyses +FunctionPropertiesPrinterPass::run(Function &F, FunctionAnalysisManager &AM) { + OS << "Printing analysis results of CFA for function " + << "'" << F.getName() << "':" + << "\n"; + AM.getResult(F).print(OS); + return PreservedAnalyses::all(); } \ No newline at end of file diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp 
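For reference, the new FunctionPropertiesInfo can also be queried directly from C++ through the new pass manager. A rough sketch, assuming the analysis is reachable via PassBuilder::registerFunctionAnalyses once PassRegistry.def lists it (as this patch does):

#include "llvm/Analysis/FunctionPropertiesAnalysis.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>

using namespace llvm;

int main(int argc, char **argv) {
  if (argc < 2)
    return 1;
  LLVMContext Ctx;
  SMDiagnostic Err;
  std::unique_ptr<Module> M = parseIRFile(argv[1], Err, Ctx);
  if (!M)
    return 1;

  // Register the function-level analyses listed in PassRegistry.def, which
  // now includes FunctionPropertiesAnalysis and its LoopAnalysis dependency.
  PassBuilder PB;
  FunctionAnalysisManager FAM;
  PB.registerFunctionAnalyses(FAM);

  for (Function &F : *M) {
    if (F.isDeclaration())
      continue;
    // Same result that the print<func-properties> pass writes out.
    FAM.getResult<FunctionPropertiesAnalysis>(F).print(outs());
  }
  return 0;
}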
index 32bad28d318ba9..eba1a522d4132e 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -576,6 +576,40 @@ bool CombinerHelper::dominates(const MachineInstr &DefMI, return isPredecessor(DefMI, UseMI); } +bool CombinerHelper::matchSextTruncSextLoad(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_SEXT_INREG); + Register SrcReg = MI.getOperand(1).getReg(); + Register LoadUser = SrcReg; + + if (MRI.getType(SrcReg).isVector()) + return false; + + Register TruncSrc; + if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc)))) + LoadUser = TruncSrc; + + uint64_t SizeInBits = MI.getOperand(2).getImm(); + // If the source is a G_SEXTLOAD from the same bit width, then we don't + // need any extend at all, just a truncate. + if (auto *LoadMI = getOpcodeDef(TargetOpcode::G_SEXTLOAD, LoadUser, MRI)) { + const auto &MMO = **LoadMI->memoperands_begin(); + // If truncating more than the original extended value, abort. + if (TruncSrc && MRI.getType(TruncSrc).getSizeInBits() < MMO.getSizeInBits()) + return false; + if (MMO.getSizeInBits() == SizeInBits) + return true; + } + return false; +} + +bool CombinerHelper::applySextTruncSextLoad(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_SEXT_INREG); + MachineIRBuilder MIB(MI); + MIB.buildCopy(MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); + MI.eraseFromParent(); + return true; +} + bool CombinerHelper::findPostIndexCandidate(MachineInstr &MI, Register &Addr, Register &Base, Register &Offset) { auto &MF = *MI.getParent()->getParent(); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index a5706958b39fa2..aa898d5a618963 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1486,6 +1486,14 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, return translateBinaryOp(TargetOpcode::G_USUBSAT, CI, MIRBuilder); case Intrinsic::ssub_sat: return translateBinaryOp(TargetOpcode::G_SSUBSAT, CI, MIRBuilder); + case Intrinsic::umin: + return translateBinaryOp(TargetOpcode::G_UMIN, CI, MIRBuilder); + case Intrinsic::umax: + return translateBinaryOp(TargetOpcode::G_UMAX, CI, MIRBuilder); + case Intrinsic::smin: + return translateBinaryOp(TargetOpcode::G_SMIN, CI, MIRBuilder); + case Intrinsic::smax: + return translateBinaryOp(TargetOpcode::G_SMAX, CI, MIRBuilder); case Intrinsic::fmuladd: { const TargetMachine &TM = MF->getTarget(); const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 8dadb2833e8d9a..9d2f64b94df166 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6363,6 +6363,36 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Op1, Op2, Op3, DAG, TLI)); return; } + case Intrinsic::smax: { + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + setValue(&I, DAG.getNode(ISD::SMAX, sdl, Op1.getValueType(), Op1, Op2)); + return; + } + case Intrinsic::smin: { + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + setValue(&I, DAG.getNode(ISD::SMIN, sdl, Op1.getValueType(), Op1, Op2)); + return; + } + case Intrinsic::umax: { + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + 
setValue(&I, DAG.getNode(ISD::UMAX, sdl, Op1.getValueType(), Op1, Op2)); + return; + } + case Intrinsic::umin: { + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + setValue(&I, DAG.getNode(ISD::UMIN, sdl, Op1.getValueType(), Op1, Op2)); + return; + } + case Intrinsic::abs: { + // TODO: Preserve "int min is poison" arg in SDAG? + SDValue Op1 = getValue(I.getArgOperand(0)); + setValue(&I, DAG.getNode(ISD::ABS, sdl, Op1.getValueType(), Op1)); + return; + } case Intrinsic::stacksave: { SDValue Op = getRoot(); EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index f323d37ca46adf..edaca9ebf60901 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -129,10 +129,10 @@ FUNCTION_ANALYSIS("domtree", DominatorTreeAnalysis()) FUNCTION_ANALYSIS("postdomtree", PostDominatorTreeAnalysis()) FUNCTION_ANALYSIS("demanded-bits", DemandedBitsAnalysis()) FUNCTION_ANALYSIS("domfrontier", DominanceFrontierAnalysis()) +FUNCTION_ANALYSIS("func-properties", FunctionPropertiesAnalysis()) FUNCTION_ANALYSIS("loops", LoopAnalysis()) FUNCTION_ANALYSIS("lazy-value-info", LazyValueAnalysis()) FUNCTION_ANALYSIS("da", DependenceAnalysis()) -FUNCTION_ANALYSIS("func-properties", FunctionPropertiesAnalysis()) FUNCTION_ANALYSIS("inliner-size-estimator", InlineSizeEstimatorAnalysis()) FUNCTION_ANALYSIS("memdep", MemoryDependenceAnalysis()) FUNCTION_ANALYSIS("memoryssa", MemorySSAAnalysis()) @@ -234,6 +234,7 @@ FUNCTION_PASS("print", DominatorTreePrinterPass(dbgs())) FUNCTION_PASS("print", PostDominatorTreePrinterPass(dbgs())) FUNCTION_PASS("print", DemandedBitsPrinterPass(dbgs())) FUNCTION_PASS("print", DominanceFrontierPrinterPass(dbgs())) +FUNCTION_PASS("print", FunctionPropertiesPrinterPass(dbgs())) FUNCTION_PASS("print", InlineCostAnnotationPrinterPass(dbgs())) FUNCTION_PASS("print", InlineSizeEstimatorAnalysisPrinterPass(dbgs())) diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 1e39db5a984a6f..b8da73de0f8c81 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -79,6 +79,6 @@ def shuffle_vector_pseudos : GICombineGroup<[dup, rev, ext, zip, uzp, trn]>; def AArch64PostLegalizerCombinerHelper : GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper", [erase_undef_store, combines_for_extload, - shuffle_vector_pseudos]> { + sext_trunc_sextload, shuffle_vector_pseudos]> { let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule"; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 791b10f4917177..a72d100b130754 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -1155,12 +1155,6 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, }); } - SmallVector InArgs; - if (!Info.OrigRet.Ty->isVoidTy()) { - LLVM_DEBUG(dbgs() << "Call return values not yet handled\n"); - return false; - } - // If we can lower as a tail call, do that instead. bool CanTailCallOpt = false; @@ -1232,9 +1226,6 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); - // Now we can add the actual call instruction to the correct position. 
- MIRBuilder.insertInstr(MIB); - // If Callee is a reg, since it is used by a target specific // instruction, it must have a register class matching the // constraint of that instruction. @@ -1248,6 +1239,31 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, 1)); } + auto OrigInsertPt = MIRBuilder.getInsertPt(); + + // Now we can add the actual call instruction to the correct position. + MIRBuilder.insertInstr(MIB); + + // Insert this now to give us an anchor point for managing the insert point. + MachineInstrBuilder CallSeqEnd = + MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN); + + SmallVector InArgs; + if (!Info.OrigRet.Ty->isVoidTy()) { + splitToValueTypes( + MIRBuilder, Info.OrigRet, InArgs, DL, Info.CallConv, false, + [&](ArrayRef Regs, Register DstReg, + LLT LLTy, LLT PartLLT, int VTSplitIdx) { + assert(DstReg == Info.OrigRet.Regs[VTSplitIdx]); + packSplitRegsToOrigType(MIRBuilder, Info.OrigRet.Regs[VTSplitIdx], + Regs, LLTy, PartLLT); + }); + } + + // Make sure the raw argument copies are inserted before the marshalling to + // the original types. + MIRBuilder.setInsertPt(MIRBuilder.getMBB(), CallSeqEnd); + // Finally we can copy the returned value back into its virtual-register. In // symmetry with the arguments, the physical register must be an // implicit-define of the call instruction. @@ -1260,9 +1276,10 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, } uint64_t CalleePopBytes = NumBytes; - MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN) - .addImm(0) - .addImm(CalleePopBytes); + CallSeqEnd.addImm(0) + .addImm(CalleePopBytes); + // Restore the insert point to after the call sequence. + MIRBuilder.setInsertPt(MIRBuilder.getMBB(), OrigInsertPt); return true; } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 0986e1efb98406..ea1d20f7387dfc 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -35,7 +35,7 @@ struct FoldCandidate { int FrameIndexToFold; }; int ShrinkOpcode; - unsigned char UseOpNo; + unsigned UseOpNo; MachineOperand::MachineOperandType Kind; bool Commuted; @@ -662,6 +662,11 @@ void SIFoldOperands::foldOperand( Use = MRI->use_begin(DestReg), E = MRI->use_end(); Use != E; Use = NextUse) { NextUse = std::next(Use); + + // There's no point trying to fold into an implicit operand. + if (Use->isImplicit()) + continue; + FoldCandidate FC = FoldCandidate(Use->getParent(), Use.getOperandNo(), &UseMI->getOperand(1)); CopyUses.push_back(FC); diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 32dc14e5ec1952..985e88dfa017f1 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -276,7 +276,7 @@ class SCCPSolver : public InstVisitor { // isEdgeFeasible - Return true if the control flow edge from the 'From' basic // block to the 'To' basic block is currently feasible. - bool isEdgeFeasible(BasicBlock *From, BasicBlock *To); + bool isEdgeFeasible(BasicBlock *From, BasicBlock *To) const; std::vector getStructLatticeValueFor(Value *V) const { std::vector StructValues; @@ -705,7 +705,7 @@ void SCCPSolver::getFeasibleSuccessors(Instruction &TI, // isEdgeFeasible - Return true if the control flow edge from the 'From' basic // block to the 'To' basic block is currently feasible. 
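isEdgeFeasible gains a const qualifier here because the new removeNonFeasibleEdges helper below receives the solver by const reference and only queries it. A minimal sketch of that constraint, with a hypothetical Solver type standing in for SCCPSolver:

// Hypothetical Solver standing in for SCCPSolver.
struct Solver {
  // The trailing const is what lets a caller holding `const Solver &`
  // use this query, mirroring SCCPSolver::isEdgeFeasible after this change.
  bool isEdgeFeasible(int From, int To) const { return From != To; }
};

// Mirrors the shape of the new helper: the solver is only consulted,
// never mutated, so it is passed by const reference.
static bool removeNonFeasibleEdges(const Solver &S, int BB) {
  return S.isEdgeFeasible(BB, BB + 1);
}

int main() {
  Solver S;
  return removeNonFeasibleEdges(S, 0) ? 0 : 1;
}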
-bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { +bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) const { // Check if we've called markEdgeExecutable on the edge yet. (We could // be more aggressive and try to consider edges which haven't been marked // yet, but there isn't any need.) @@ -1807,39 +1807,51 @@ static void findReturnsToZap(Function &F, } } -// Update the condition for terminators that are branching on indeterminate -// values, forcing them to use a specific edge. -static void forceIndeterminateEdge(Instruction* I, SCCPSolver &Solver) { - BasicBlock *Dest = nullptr; - Constant *C = nullptr; - if (SwitchInst *SI = dyn_cast(I)) { - if (!isa(SI->getCondition())) { - // Indeterminate switch; use first case value. - Dest = SI->case_begin()->getCaseSuccessor(); - C = SI->case_begin()->getCaseValue(); - } - } else if (BranchInst *BI = dyn_cast(I)) { - if (!isa(BI->getCondition())) { - // Indeterminate branch; use false. - Dest = BI->getSuccessor(1); - C = ConstantInt::getFalse(BI->getContext()); - } - } else if (IndirectBrInst *IBR = dyn_cast(I)) { - if (!isa(IBR->getAddress()->stripPointerCasts())) { - // Indeterminate indirectbr; use successor 0. - Dest = IBR->getSuccessor(0); - C = BlockAddress::get(IBR->getSuccessor(0)); - } - } else { - llvm_unreachable("Unexpected terminator instruction"); +static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB, + DomTreeUpdater &DTU) { + SmallPtrSet FeasibleSuccessors; + bool HasNonFeasibleEdges = false; + for (BasicBlock *Succ : successors(BB)) { + if (Solver.isEdgeFeasible(BB, Succ)) + FeasibleSuccessors.insert(Succ); + else + HasNonFeasibleEdges = true; } - if (C) { - assert(Solver.isEdgeFeasible(I->getParent(), Dest) && - "Didn't find feasible edge?"); - (void)Dest; - I->setOperand(0, C); + // All edges feasible, nothing to do. + if (!HasNonFeasibleEdges) + return false; + + // SCCP can only determine non-feasible edges for br, switch and indirectbr. + Instruction *TI = BB->getTerminator(); + assert((isa(TI) || isa(TI) || + isa(TI)) && + "Terminator must be a br, switch or indirectbr"); + + if (FeasibleSuccessors.size() == 1) { + // Replace with an unconditional branch to the only feasible successor. + BasicBlock *OnlyFeasibleSuccessor = *FeasibleSuccessors.begin(); + SmallVector Updates; + bool HaveSeenOnlyFeasibleSuccessor = false; + for (BasicBlock *Succ : successors(BB)) { + if (Succ == OnlyFeasibleSuccessor && !HaveSeenOnlyFeasibleSuccessor) { + // Don't remove the edge to the only feasible successor the first time + // we see it. We still do need to remove any multi-edges to it though. + HaveSeenOnlyFeasibleSuccessor = true; + continue; + } + + Succ->removePredecessor(BB); + Updates.push_back({DominatorTree::Delete, BB, Succ}); + } + + DTU.applyUpdatesPermissive(Updates); + BranchInst::Create(OnlyFeasibleSuccessor, BB); + TI->eraseFromParent(); + } else { + llvm_unreachable("Either all successors are feasible, or exactly one is"); } + return true; } bool llvm::runIPSCCP( @@ -1952,45 +1964,11 @@ bool llvm::runIPSCCP( /*UseLLVMTrap=*/false, /*PreserveLCSSA=*/false, &DTU); - // Now that all instructions in the function are constant folded, - // use ConstantFoldTerminator to get rid of in-edges, record DT updates and - // delete dead BBs. - for (BasicBlock *DeadBB : BlocksToErase) { - // If there are any PHI nodes in this successor, drop entries for BB now. 
- for (Value::user_iterator UI = DeadBB->user_begin(), - UE = DeadBB->user_end(); - UI != UE;) { - // Grab the user and then increment the iterator early, as the user - // will be deleted. Step past all adjacent uses from the same user. - auto *I = dyn_cast(*UI); - do { ++UI; } while (UI != UE && *UI == I); - - // Ignore blockaddress users; BasicBlock's dtor will handle them. - if (!I) continue; - - // If we have forced an edge for an indeterminate value, then force the - // terminator to fold to that edge. - forceIndeterminateEdge(I, Solver); - BasicBlock *InstBB = I->getParent(); - bool Folded = ConstantFoldTerminator(InstBB, - /*DeleteDeadConditions=*/false, - /*TLI=*/nullptr, &DTU); - assert(Folded && - "Expect TermInst on constantint or blockaddress to be folded"); - (void) Folded; - // If we folded the terminator to an unconditional branch to another - // dead block, replace it with Unreachable, to avoid trying to fold that - // branch again. - BranchInst *BI = cast(InstBB->getTerminator()); - if (BI && BI->isUnconditional() && - !Solver.isBlockExecutable(BI->getSuccessor(0))) { - InstBB->getTerminator()->eraseFromParent(); - new UnreachableInst(InstBB->getContext(), InstBB); - } - } - // Mark dead BB for deletion. + for (BasicBlock &BB : F) + MadeChanges |= removeNonFeasibleEdges(Solver, &BB, DTU); + + for (BasicBlock *DeadBB : BlocksToErase) DTU.deleteBB(DeadBB); - } for (BasicBlock &BB : F) { for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) { diff --git a/llvm/test/Analysis/FunctionPropertiesAnalysis/matmul.ll b/llvm/test/Analysis/FunctionPropertiesAnalysis/matmul.ll new file mode 100644 index 00000000000000..506914972a1042 --- /dev/null +++ b/llvm/test/Analysis/FunctionPropertiesAnalysis/matmul.ll @@ -0,0 +1,140 @@ +; RUN: opt < %s -passes='print' -disable-output 2>&1 | FileCheck %s + +define i32 @main() { +; CHECK-DAG: Printing analysis results of CFA for function 'main': + +entry: + %retval = alloca i32, align 4 + %mat1 = alloca [2 x [2 x i32]], align 16 + %mat2 = alloca [2 x [2 x i32]], align 16 + %res = alloca [2 x [2 x i32]], align 16 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + %arraydecay = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %mat1, i64 0, i64 0 + %arraydecay1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %mat2, i64 0, i64 0 + %arraydecay2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %res, i64 0, i64 0 + call void @multiply([2 x i32]* %arraydecay, [2 x i32]* %arraydecay1, [2 x i32]* %arraydecay2) + ret i32 0 +} +; CHECK-DAG: BasicBlockCount: 1 +; CHECK-DAG: BlocksReachedFromConditionalInstruction: 0 +; CHECK-DAG: Uses: 1 +; CHECK-DAG: DirectCallsToDefinedFunctions: 1 +; CHECK-DAG: LoadInstCount: 0 +; CHECK-DAG: StoreInstCount: 1 +; CHECK-DAG: MaxLoopDepth: 0 +; CHECK-DAG: TopLevelLoopCount: 0 + +define void @multiply([2 x i32]* %mat1, [2 x i32]* %mat2, [2 x i32]* %res) { +; CHECK-DAG: Printing analysis results of CFA for function 'multiply': +entry: + %mat1.addr = alloca [2 x i32]*, align 8 + %mat2.addr = alloca [2 x i32]*, align 8 + %res.addr = alloca [2 x i32]*, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %k = alloca i32, align 4 + store [2 x i32]* %mat1, [2 x i32]** %mat1.addr, align 8 + store [2 x i32]* %mat2, [2 x i32]** %mat2.addr, align 8 + store [2 x i32]* %res, [2 x i32]** %res.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc24, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp 
slt i32 %0, 2 + br i1 %cmp, label %for.body, label %for.end26 + +for.body: ; preds = %for.cond + store i32 0, i32* %j, align 4 + br label %for.cond1 + +for.cond1: ; preds = %for.inc21, %for.body + %1 = load i32, i32* %j, align 4 + %cmp2 = icmp slt i32 %1, 2 + br i1 %cmp2, label %for.body3, label %for.end23 + +for.body3: ; preds = %for.cond1 + %2 = load [2 x i32]*, [2 x i32]** %res.addr, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %2, i64 %idxprom + %4 = load i32, i32* %j, align 4 + %idxprom4 = sext i32 %4 to i64 + %arrayidx5 = getelementptr inbounds [2 x i32], [2 x i32]* %arrayidx, i64 0, i64 %idxprom4 + store i32 0, i32* %arrayidx5, align 4 + store i32 0, i32* %k, align 4 + br label %for.cond6 + +for.cond6: ; preds = %for.inc, %for.body3 + %5 = load i32, i32* %k, align 4 + %cmp7 = icmp slt i32 %5, 2 + br i1 %cmp7, label %for.body8, label %for.end + +for.body8: ; preds = %for.cond6 + %6 = load [2 x i32]*, [2 x i32]** %mat1.addr, align 8 + %7 = load i32, i32* %i, align 4 + %idxprom9 = sext i32 %7 to i64 + %arrayidx10 = getelementptr inbounds [2 x i32], [2 x i32]* %6, i64 %idxprom9 + %8 = load i32, i32* %k, align 4 + %idxprom11 = sext i32 %8 to i64 + %arrayidx12 = getelementptr inbounds [2 x i32], [2 x i32]* %arrayidx10, i64 0, i64 %idxprom11 + %9 = load i32, i32* %arrayidx12, align 4 + %10 = load [2 x i32]*, [2 x i32]** %mat2.addr, align 8 + %11 = load i32, i32* %k, align 4 + %idxprom13 = sext i32 %11 to i64 + %arrayidx14 = getelementptr inbounds [2 x i32], [2 x i32]* %10, i64 %idxprom13 + %12 = load i32, i32* %j, align 4 + %idxprom15 = sext i32 %12 to i64 + %arrayidx16 = getelementptr inbounds [2 x i32], [2 x i32]* %arrayidx14, i64 0, i64 %idxprom15 + %13 = load i32, i32* %arrayidx16, align 4 + %mul = mul nsw i32 %9, %13 + %14 = load [2 x i32]*, [2 x i32]** %res.addr, align 8 + %15 = load i32, i32* %i, align 4 + %idxprom17 = sext i32 %15 to i64 + %arrayidx18 = getelementptr inbounds [2 x i32], [2 x i32]* %14, i64 %idxprom17 + %16 = load i32, i32* %j, align 4 + %idxprom19 = sext i32 %16 to i64 + %arrayidx20 = getelementptr inbounds [2 x i32], [2 x i32]* %arrayidx18, i64 0, i64 %idxprom19 + %17 = load i32, i32* %arrayidx20, align 4 + %add = add nsw i32 %17, %mul + store i32 %add, i32* %arrayidx20, align 4 + br label %for.inc + +for.inc: ; preds = %for.body8 + %18 = load i32, i32* %k, align 4 + %inc = add nsw i32 %18, 1 + store i32 %inc, i32* %k, align 4 + br label %for.cond6 + +for.end: ; preds = %for.cond6 + br label %for.inc21 + +for.inc21: ; preds = %for.end + %19 = load i32, i32* %j, align 4 + %inc22 = add nsw i32 %19, 1 + store i32 %inc22, i32* %j, align 4 + br label %for.cond1 + +for.end23: ; preds = %for.cond1 + br label %for.inc24 + +for.inc24: ; preds = %for.end23 + %20 = load i32, i32* %i, align 4 + %inc25 = add nsw i32 %20, 1 + store i32 %inc25, i32* %i, align 4 + br label %for.cond + +for.end26: ; preds = %for.cond + ret void +} + +; CHECK-DAG: BasicBlockCount: 13 +; CHECK-DAG: BlocksReachedFromConditionalInstruction: 6 +; CHECK-DAG: Uses: 2 +; CHECK-DAG: DirectCallsToDefinedFunctions: 0 +; CHECK-DAG: LoadInstCount: 21 +; CHECK-DAG: StoreInstCount: 11 +; CHECK-DAG: MaxLoopDepth: 3 +; CHECK-DAG: TopLevelLoopCount: 1 \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-sext-trunc-sextload.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sext-trunc-sextload.mir new file mode 100644 index 00000000000000..616973c04ac4ad --- /dev/null +++ 
b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sext-trunc-sextload.mir @@ -0,0 +1,63 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64 -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s +--- +name: test_combine_sext_trunc_of_sextload +legalized: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x0 + ; CHECK-LABEL: name: test_combine_sext_trunc_of_sextload + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[SEXTLOAD:%[0-9]+]]:_(s64) = G_SEXTLOAD [[COPY]](p0) :: (load 2) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[SEXTLOAD]](s64) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32) + ; CHECK: $w0 = COPY [[COPY1]](s32) + %0:_(p0) = COPY $x0 + %1:_(s64) = G_SEXTLOAD %0:_(p0) :: (load 2) + %2:_(s32) = G_TRUNC %1:_(s64) + %3:_(s32) = G_SEXT_INREG %2:_(s32), 16 + $w0 = COPY %3(s32) +... +--- +name: test_combine_sext_of_sextload +legalized: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x0 + ; CHECK-LABEL: name: test_combine_sext_of_sextload + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p0) :: (load 2) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SEXTLOAD]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: $w0 = COPY [[COPY2]](s32) + %0:_(p0) = COPY $x0 + %1:_(s32) = G_SEXTLOAD %0:_(p0) :: (load 2) + %2:_(s32) = COPY %1:_(s32) + %3:_(s32) = G_SEXT_INREG %2:_(s32), 16 + $w0 = COPY %3(s32) +... +--- +name: test_combine_sext_of_sextload_not_matching +legalized: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x0 + ; Here we're trying to extend from a smaller value than was extended in the load. + ; CHECK-LABEL: name: test_combine_sext_of_sextload_not_matching + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p0) :: (load 2) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SEXTLOAD]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 24 + ; CHECK: $w0 = COPY [[SEXT_INREG]](s32) + %0:_(p0) = COPY $x0 + %1:_(s32) = G_SEXTLOAD %0:_(p0) :: (load 2) + %2:_(s32) = COPY %1:_(s32) + %3:_(s32) = G_SEXT_INREG %2:_(s32), 24 + $w0 = COPY %3(s32) +... 
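
The AArch64 MIR test above exercises the post-legalizer combine that removes a G_SEXT_INREG whose operand is (a copy or truncation of) a G_SEXTLOAD result. The arithmetic fact motivating the first two cases is that sign-extending a value from N bits is a no-op once that value has already been sign-extended from M <= N bits, which is exactly what the sextload guarantees. The standalone C++ sketch below only models that identity with plain integers; signExtendFrom is a made-up helper for illustration and is not the combiner's actual matching code, which, as the third (not_matching) case shows, can be stricter about which widths it accepts.

// Minimal model of the identity behind the sext(trunc(sextload)) combine.
// signExtendFrom(X, Bits) mirrors what G_SEXT_INREG X, Bits computes:
// sign-extend X from its low Bits bits. Hypothetical helper, not LLVM API;
// it assumes the usual two's-complement arithmetic right shift.
#include <cassert>
#include <cstdint>

static int32_t signExtendFrom(int32_t X, unsigned Bits) {
  assert(Bits >= 1 && Bits <= 32);
  const unsigned Shift = 32 - Bits;
  return static_cast<int32_t>(static_cast<uint32_t>(X) << Shift) >> Shift;
}

int main() {
  // A G_SEXTLOAD of 2 bytes yields a value already sign-extended from 16 bits.
  const int32_t Loaded = signExtendFrom(0xFEDC, 16); // raw 16-bit memory image

  // G_SEXT_INREG ..., 16 on that value changes nothing, which is why the
  // combine can drop it in the first two test cases.
  assert(signExtendFrom(Loaded, 16) == Loaded);

  // Re-extending from any width >= 16 is equally a no-op on such a value,
  // even though the not_matching case shows the combine itself does not
  // fire when the widths disagree.
  assert(signExtendFrom(Loaded, 24) == Loaded);
  return 0;
}

Building and running this sketch succeeds silently; it is only meant to make the redundancy the combine exploits concrete, not to reproduce the CombinerHelper matching logic.
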
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll new file mode 100644 index 00000000000000..1136afafbd4aaa --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll @@ -0,0 +1,2509 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=irtranslator -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +declare i1 @external_i1_func_void() #0 +declare zeroext i1 @external_i1_zeroext_func_void() #0 +declare signext i1 @external_i1_signext_func_void() #0 + +declare i8 @external_i8_func_void() #0 +declare zeroext i8 @external_i8_zeroext_func_void() #0 +declare signext i8 @external_i8_signext_func_void() #0 + +declare i16 @external_i16_func_void() #0 +declare <2 x i16> @external_v2i16_func_void() #0 +declare <3 x i16> @external_v3i16_func_void() #0 +declare <4 x i16> @external_v4i16_func_void() #0 +declare zeroext i16 @external_i16_zeroext_func_void() #0 +declare signext i16 @external_i16_signext_func_void() #0 + +declare i48 @external_i48_func_void() #0 +declare zeroext i48 @external_i48_zeroext_func_void() #0 +declare signext i48 @external_i48_signext_func_void() #0 + +declare i32 @external_i32_func_void() #0 +declare i64 @external_i64_func_void() #0 +declare half @external_f16_func_void() #0 +declare float @external_f32_func_void() #0 +declare double @external_f64_func_void() #0 + +declare i8 addrspace(1)* @external_p1_func_void() #0 +declare <2 x i8 addrspace(1)*> @external_v2p1_func_void() #0 + +declare i8 addrspace(3)* @external_p3_func_void() #0 +declare <2 x i8 addrspace(3)*> @external_v2p3_func_void() #0 + +declare <2 x half> @external_v2f16_func_void() #0 +declare <3 x half> @external_v3f16_func_void() #0 +declare <4 x half> @external_v4f16_func_void() #0 +declare <3 x float> @external_v3f32_func_void() #0 +declare <5 x float> @external_v5f32_func_void() #0 +declare <2 x double> @external_v2f64_func_void() #0 + +declare <2 x i32> @external_v2i32_func_void() #0 +declare <3 x i32> @external_v3i32_func_void() #0 +declare <4 x i32> @external_v4i32_func_void() #0 +declare <5 x i32> @external_v5i32_func_void() #0 +declare <8 x i32> @external_v8i32_func_void() #0 +declare <16 x i32> @external_v16i32_func_void() #0 +declare <32 x i32> @external_v32i32_func_void() #0 +declare { <32 x i32>, i32 } @external_v32i32_i32_func_void() #0 + +declare { i32, i64 } @external_i32_i64_func_void() #0 + +declare [2 x i32] @external_a2i32_func_void() #0 +declare [5 x i8] @external_a5i8_func_void() #0 + +; return value and argument +declare hidden i32 @external_i32_func_i32(i32) #0 + + +define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 { + ; GCN-LABEL: name: test_call_external_i32_func_i32_imm + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = 
COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 + ; GCN: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) + ; GCN: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[INT]](p4) :: (dereferenceable invariant load 8 from %ir.out.kernarg.offset.cast, align 16, addrspace 4) + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i32_func_i32 + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: $vgpr0 = COPY [[C]](s32) + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i32_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[COPY21]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out.load, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i32 @external_i32_func_i32(i32 42) + store volatile i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @test_call_external_i1_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i1_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; 
GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i1_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i1_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store 1 into `i1 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i1 @external_i1_func_void() + store volatile i1 %val, i1 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i1_zeroext_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i1_zeroext_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: 
[[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i1_zeroext_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1) + ; GCN: G_STORE [[ZEXT]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i1 @external_i1_zeroext_func_void() + %val.ext = zext i1 %val to i32 + store volatile i32 %val.ext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i1_signext_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i1_signext_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i1_signext_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: 
[[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i1_signext_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1) + ; GCN: G_STORE [[SEXT]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i1 @external_i1_signext_func_void() + %val.ext = sext i1 %val to i32 + store volatile i32 %val.ext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i8_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i8_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i8_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: 
[[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i8_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i8 @external_i8_func_void() + store volatile i8 %val, i8 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i8_zeroext_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i8_zeroext_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i8_zeroext_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY 
[[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i8_zeroext_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC1]](s8) + ; GCN: G_STORE [[ZEXT]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i8 @external_i8_zeroext_func_void() + %val.ext = zext i8 %val to i32 + store volatile i32 %val.ext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i8_signext_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i8_signext_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i8_signext_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 
= COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i8_signext_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s8) + ; GCN: G_STORE [[SEXT]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i8 @external_i8_signext_func_void() + %val.ext = sext i8 %val to i32 + store volatile i32 %val.ext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i16_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i16_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i16_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, 
implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[TRUNC]](s16), [[DEF]](p1) :: (volatile store 2 into `i16 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i16 @external_i16_func_void() + store volatile i16 %val, i16 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i16_zeroext_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i16_zeroext_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i16_zeroext_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i16_zeroext_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: 
[[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s16) + ; GCN: G_STORE [[ZEXT]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i16 @external_i16_zeroext_func_void() + %val.ext = zext i16 %val to i32 + store volatile i32 %val.ext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i16_signext_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i16_signext_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i16_signext_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i16_signext_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s16) + ; GCN: G_STORE [[SEXT]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i16 @external_i16_signext_func_void() + %val.ext = sext i16 %val to 
i32 + store volatile i32 %val.ext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[COPY21]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i32 @external_i32_func_void() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i48_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i48_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY 
$vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i48_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i48_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN: [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64) + ; GCN: G_STORE [[TRUNC]](s48), [[DEF]](p1) :: (volatile store 6 into `i48 addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i48 @external_i48_func_void() + store volatile i48 %val, i48 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i48_zeroext_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i48_zeroext_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: 
[[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i48_zeroext_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i48_zeroext_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN: [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64) + ; GCN: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s48) + ; GCN: G_STORE [[ZEXT]](s64), [[DEF]](p1) :: (volatile store 8 into `i64 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i48 @external_i48_zeroext_func_void() + %ext = zext i48 %val to i64 + store volatile i64 %ext, i64 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i48_signext_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i48_signext_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: 
[[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i48_signext_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i48_signext_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN: [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64) + ; GCN: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s48) + ; GCN: G_STORE [[SEXT]](s64), [[DEF]](p1) :: (volatile store 8 into `i64 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i48 @external_i48_signext_func_void() + %ext = sext i48 %val to i64 + store volatile i64 %ext, i64 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i64_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i64_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = 
G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i64_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i64_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN: G_STORE [[MV]](s64), [[DEF]](p1) :: (volatile store 8 into `i64 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i64 @external_i64_func_void() + store volatile i64 %val, i64 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_p1_func_void() #0 { + ; GCN-LABEL: name: test_call_external_p1_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_p1_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: 
[[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_p1_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN: G_STORE [[MV]](p1), [[DEF]](p1) :: (volatile store 8 into `i8 addrspace(1)* addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i8 addrspace(1)* @external_p1_func_void() + store volatile i8 addrspace(1)* %val, i8 addrspace(1)* addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v2p1_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v2p1_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v2p1_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY 
[[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v2p1_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY23]](s32), [[COPY24]](s32) + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) + ; GCN: G_STORE [[BUILD_VECTOR]](<2 x p1>), [[DEF]](p1) :: (volatile store 16 into `<2 x i8 addrspace(1)*> addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <2 x i8 addrspace(1)*> @external_v2p1_func_void() + store volatile <2 x i8 addrspace(1)*> %val, <2 x i8 addrspace(1)*> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_p3_func_void() #0 { + ; GCN-LABEL: name: test_call_external_p3_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p3) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_p3_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + 
; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_p3_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[COPY21]](p3), [[DEF]](p3) :: (volatile store 4 into `i8 addrspace(3)* addrspace(3)* undef`, addrspace 3) + ; GCN: S_ENDPGM 0 + %val = call i8 addrspace(3)* @external_p3_func_void() + store volatile i8 addrspace(3)* %val, i8 addrspace(3)* addrspace(3)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v2p3_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v2p3_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p3) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v2p3_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: 
[[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v2p3_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(p3) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[COPY21]](p3), [[COPY22]](p3) + ; GCN: G_STORE [[BUILD_VECTOR]](<2 x p3>), [[DEF]](p3) :: (volatile store 8 into `<2 x i8 addrspace(3)*> addrspace(3)* undef`, addrspace 3) + ; GCN: S_ENDPGM 0 + %val = call <2 x i8 addrspace(3)*> @external_v2p3_func_void() + store volatile <2 x i8 addrspace(3)*> %val, <2 x i8 addrspace(3)*> addrspace(3)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_f16_func_void() #0 { + ; GCN-LABEL: name: test_call_external_f16_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_f16_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL 
[[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_f16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[TRUNC]](s16), [[DEF]](p1) :: (volatile store 2 into `half addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call half @external_f16_func_void() + store volatile half %val, half addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_f32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_f32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_f32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + 
; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_f32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[COPY21]](s32), [[DEF]](p1) :: (volatile store 4 into `float addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call float @external_f32_func_void() + store volatile float %val, float addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_f64_func_void() #0 { + ; GCN-LABEL: name: test_call_external_f64_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_f64_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_f64_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit 
$vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN: G_STORE [[MV]](s64), [[DEF]](p1) :: (volatile store 8 into `double addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call double @external_f64_func_void() + store volatile double %val, double addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v2f64_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v2f64_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v2f64_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v2f64_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY 
$vgpr3 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY23]](s32), [[COPY24]](s32) + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) + ; GCN: G_STORE [[BUILD_VECTOR]](<2 x s64>), [[DEF]](p1) :: (volatile store 16 into `<2 x double> addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <2 x double> @external_v2f64_func_void() + store volatile <2 x double> %val, <2 x double> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v2i32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v2i32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v2i32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v2i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32) + ; GCN: G_STORE [[BUILD_VECTOR]](<2 x s32>), [[DEF]](p1) :: (volatile store 8 into `<2 x i32> addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <2 x i32> @external_v2i32_func_void() + store volatile <2 x i32> %val, <2 x i32> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v3i32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v3i32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v3i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32) + ; GCN: G_STORE [[BUILD_VECTOR]](<3 x s32>), [[DEF]](p1) :: 
(volatile store 12 into `<3 x i32> addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <3 x i32> @external_v3i32_func_void() + store volatile <3 x i32> %val, <3 x i32> addrspace(1)* undef, align 8 + ret void +} + +define amdgpu_kernel void @test_call_external_v4i32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v4i32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v4i32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v4i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32) + ; GCN: G_STORE [[BUILD_VECTOR]](<4 x s32>), [[DEF]](p1) :: (volatile store 16 into `<4 x i32> 
addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <4 x i32> @external_v4i32_func_void() + store volatile <4 x i32> %val, <4 x i32> addrspace(1)* undef, align 8 + ret void +} + +define amdgpu_kernel void @test_call_external_v5i32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v5i32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v5i32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v5i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32) + ; GCN: G_STORE [[BUILD_VECTOR]](<5 x s32>), 
[[DEF]](p1) :: (volatile store 20 into `<5 x i32> addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <5 x i32> @external_v5i32_func_void() + store volatile <5 x i32> %val, <5 x i32> addrspace(1)* undef, align 8 + ret void +} + +define amdgpu_kernel void @test_call_external_v8i32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v8i32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v8i32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v8i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GCN: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GCN: [[COPY28:%[0-9]+]]:_(s32) = COPY 
$vgpr7 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32) + ; GCN: G_STORE [[BUILD_VECTOR]](<8 x s32>), [[DEF]](p1) :: (volatile store 32 into `<8 x i32> addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <8 x i32> @external_v8i32_func_void() + store volatile <8 x i32> %val, <8 x i32> addrspace(1)* undef, align 8 + ret void +} + +define amdgpu_kernel void @test_call_external_v16i32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v16i32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v16i32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v16i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit-def $vgpr8, implicit-def $vgpr9, implicit-def $vgpr10, implicit-def 
$vgpr11, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr15 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GCN: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GCN: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GCN: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GCN: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GCN: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GCN: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GCN: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GCN: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GCN: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GCN: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32), [[COPY32]](s32), [[COPY33]](s32), [[COPY34]](s32), [[COPY35]](s32), [[COPY36]](s32) + ; GCN: G_STORE [[BUILD_VECTOR]](<16 x s32>), [[DEF]](p1) :: (volatile store 64 into `<16 x i32> addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <16 x i32> @external_v16i32_func_void() + store volatile <16 x i32> %val, <16 x i32> addrspace(1)* undef, align 8 + ret void +} + +define amdgpu_kernel void @test_call_external_v32i32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v32i32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v32i32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR 
[[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v32i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit-def $vgpr8, implicit-def $vgpr9, implicit-def $vgpr10, implicit-def $vgpr11, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr15, implicit-def $vgpr16, implicit-def $vgpr17, implicit-def $vgpr18, implicit-def $vgpr19, implicit-def $vgpr20, implicit-def $vgpr21, implicit-def $vgpr22, implicit-def $vgpr23, implicit-def $vgpr24, implicit-def $vgpr25, implicit-def $vgpr26, implicit-def $vgpr27, implicit-def $vgpr28, implicit-def $vgpr29, implicit-def $vgpr30, implicit-def $vgpr31 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GCN: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GCN: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GCN: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GCN: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GCN: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GCN: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GCN: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GCN: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GCN: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GCN: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GCN: [[COPY37:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GCN: [[COPY38:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GCN: [[COPY39:%[0-9]+]]:_(s32) = COPY $vgpr18 + ; GCN: [[COPY40:%[0-9]+]]:_(s32) = COPY $vgpr19 + ; GCN: [[COPY41:%[0-9]+]]:_(s32) = COPY $vgpr20 + ; GCN: [[COPY42:%[0-9]+]]:_(s32) = COPY $vgpr21 + ; GCN: [[COPY43:%[0-9]+]]:_(s32) = COPY $vgpr22 + ; GCN: [[COPY44:%[0-9]+]]:_(s32) = COPY $vgpr23 + ; GCN: [[COPY45:%[0-9]+]]:_(s32) = COPY $vgpr24 + ; GCN: [[COPY46:%[0-9]+]]:_(s32) = COPY $vgpr25 + ; GCN: [[COPY47:%[0-9]+]]:_(s32) = COPY $vgpr26 + ; GCN: [[COPY48:%[0-9]+]]:_(s32) = COPY $vgpr27 + ; GCN: [[COPY49:%[0-9]+]]:_(s32) = COPY $vgpr28 + ; GCN: [[COPY50:%[0-9]+]]:_(s32) = COPY $vgpr29 + ; GCN: [[COPY51:%[0-9]+]]:_(s32) = COPY $vgpr30 + ; GCN: [[COPY52:%[0-9]+]]:_(s32) = COPY $vgpr31 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32), [[COPY32]](s32), [[COPY33]](s32), [[COPY34]](s32), [[COPY35]](s32), [[COPY36]](s32), [[COPY37]](s32), [[COPY38]](s32), [[COPY39]](s32), [[COPY40]](s32), [[COPY41]](s32), [[COPY42]](s32), 
[[COPY43]](s32), [[COPY44]](s32), [[COPY45]](s32), [[COPY46]](s32), [[COPY47]](s32), [[COPY48]](s32), [[COPY49]](s32), [[COPY50]](s32), [[COPY51]](s32), [[COPY52]](s32) + ; GCN: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store 128 into `<32 x i32> addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <32 x i32> @external_v32i32_func_void() + store volatile <32 x i32> %val, <32 x i32> addrspace(1)* undef, align 8 + ret void +} + +define amdgpu_kernel void @test_call_external_v2i16_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v2i16_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v2i16_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v2i16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[COPY21]](<2 x s16>), [[DEF]](p1) :: (volatile store 4 into `<2 x i16> addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <2 x i16> @external_v2i16_func_void() + store 
volatile <2 x i16> %val, <2 x i16> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v3i16_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v3i16_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v3i16_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v3i16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>), [[DEF1]](<2 x s16>) + ; GCN: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; GCN: G_STORE [[UV]](<3 x s16>), [[DEF]](p1) :: (volatile store 6 into `<3 x i16> addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <3 x i16> @external_v3i16_func_void() + store volatile <3 x i16> 
%val, <3 x i16> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v4i16_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v4i16_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v4i16_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v4i16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>) + ; GCN: G_STORE [[CONCAT_VECTORS]](<4 x s16>), [[DEF]](p1) :: (volatile store 8 into `<4 x i16> addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <4 x i16> @external_v4i16_func_void() + store volatile <4 x i16> %val, <4 x i16> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v2f16_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v2f16_func_void + ; GCN: bb.1 (%ir-block.0): + ; 
GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v2f16_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v2f16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[COPY21]](<2 x s16>), [[DEF]](p1) :: (volatile store 4 into `<2 x half> addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <2 x half> @external_v2f16_func_void() + store volatile <2 x half> %val, <2 x half> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v3f16_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v3f16_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: 
[[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v3f16_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v3f16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>), [[DEF1]](<2 x s16>) + ; GCN: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; GCN: G_STORE [[UV]](<3 x s16>), [[DEF]](p1) :: (volatile store 6 into `<3 x half> addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <3 x half> @external_v3f16_func_void() + store volatile <3 x half> %val, <3 x half> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v4f16_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v4f16_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: 
[[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v4f16_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v4f16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>) + ; GCN: G_STORE [[CONCAT_VECTORS]](<4 x s16>), [[DEF]](p1) :: (volatile store 8 into `<4 x half> addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <4 x half> @external_v4f16_func_void() + store volatile <4 x half> %val, <4 x half> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v3f32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v3f32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY 
$sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v3f32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v3f32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32) + ; GCN: G_STORE [[BUILD_VECTOR]](<3 x s32>), [[DEF]](p1) :: (volatile store 12 into `<3 x float> addrspace(1)* undef`, align 16, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <3 x float> @external_v3f32_func_void() + store volatile <3 x float> %val, <3 x float> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v5f32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v5f32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: 
ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v5f32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v5f32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32) + ; GCN: G_STORE [[BUILD_VECTOR]](<5 x s32>), [[DEF]](p1) :: (volatile store 20 into `<5 x float> addrspace(1)* undef`, align 32, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <5 x float> @external_v5f32_func_void() + store volatile <5 x float> %val, <5 x float> addrspace(1)* undef + ret void +} + + +define amdgpu_kernel void @test_call_external_i32_i64_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i32_i64_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: 
[[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: [[COPY10:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i32_i64_func_void + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY13]], [[C]](s64) + ; GCN: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY18]], [[SHL]] + ; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY20]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY11]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY12]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY14]](s64) + ; GCN: $sgpr12 = COPY [[COPY15]](s32) + ; GCN: $sgpr13 = COPY [[COPY16]](s32) + ; GCN: $sgpr14 = COPY [[COPY17]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i32_i64_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY23]](s32), [[COPY24]](s32) + ; GCN: G_STORE [[COPY22]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: G_STORE [[MV]](s64), [[COPY10]](p1) :: (volatile store 8 into `i64 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call { i32, i64 } @external_i32_i64_func_void() + %val.0 = extractvalue { i32, i64 } %val, 0 + %val.1 = extractvalue { i32, i64 } %val, 1 + store volatile i32 %val.0, i32 addrspace(1)* undef + store volatile i64 %val.1, i64 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_a2i32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_a2i32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: 
[[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_a2i32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_a2i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[COPY21]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: G_STORE [[COPY22]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call [2 x i32] @external_a2i32_func_void() + %val.0 = extractvalue [2 x i32] %val, 0 + %val.1 = extractvalue [2 x i32] %val, 1 + store volatile i32 %val.0, i32 addrspace(1)* undef + store volatile i32 %val.1, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_a5i8_func_void() #0 { + ; GCN-LABEL: name: test_call_external_a5i8_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: 
[[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_a5i8_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_a5i8_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16) + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY22]](s32) + ; GCN: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC2]](s16) + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY23]](s32) + ; GCN: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC4]](s16) + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY24]](s32) + ; GCN: [[TRUNC7:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC6]](s16) + ; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[COPY25]](s32) + ; GCN: [[TRUNC9:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC8]](s16) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1) + ; GCN: G_STORE [[TRUNC3]](s8), [[DEF]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1) + ; GCN: G_STORE [[TRUNC5]](s8), [[DEF]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1) + ; GCN: G_STORE [[TRUNC7]](s8), [[DEF]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1) + ; GCN: G_STORE [[TRUNC9]](s8), [[DEF]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call [5 x i8] @external_a5i8_func_void() + %val.0 
= extractvalue [5 x i8] %val, 0 + %val.1 = extractvalue [5 x i8] %val, 1 + %val.2 = extractvalue [5 x i8] %val, 2 + %val.3 = extractvalue [5 x i8] %val, 3 + %val.4 = extractvalue [5 x i8] %val, 4 + store volatile i8 %val.0, i8 addrspace(1)* undef + store volatile i8 %val.1, i8 addrspace(1)* undef + store volatile i8 %val.2, i8 addrspace(1)* undef + store volatile i8 %val.3, i8 addrspace(1)* undef + store volatile i8 %val.4, i8 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind noinline } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll new file mode 100644 index 00000000000000..d53cfe688f53cf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=irtranslator -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +declare hidden void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* sret, { i8, i32 } addrspace(5)* byval) #0 + +define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 { + ; GCN-LABEL: name: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 + ; GCN: bb.1 (%ir-block.1): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 3 + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: [[COPY10:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) + ; GCN: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.in.val + ; GCN: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.1.out.val + ; GCN: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C2]](s32) + ; GCN: G_STORE [[C]](s8), [[FRAME_INDEX]](p5) :: (store 1 into %ir.in.gep01, addrspace 5) + ; GCN: G_STORE [[C1]](s32), [[PTR_ADD]](p5) :: (store 4 into %ir.in.gep1, addrspace 5) + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GCN: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY13]], [[C3]](s64) + ; GCN: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: 
[[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C4]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY18]], [[SHL]] + ; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY20]], [[C5]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: $vgpr0 = COPY [[FRAME_INDEX1]](p5) + ; GCN: $vgpr1 = COPY [[FRAME_INDEX]](p5) + ; GCN: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY11]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY12]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD1]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY14]](s64) + ; GCN: $sgpr12 = COPY [[COPY15]](s32) + ; GCN: $sgpr13 = COPY [[COPY16]](s32) + ; GCN: $sgpr14 = COPY [[COPY17]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C2]](s32) + ; GCN: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX1]](p5) :: (dereferenceable load 1 from %ir.out.gep02, addrspace 5) + ; GCN: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (dereferenceable load 4 from %ir.out.gep1, addrspace 5) + ; GCN: G_STORE [[LOAD]](s8), [[DEF]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1) + ; GCN: G_STORE [[LOAD1]](s32), [[COPY10]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %in.val = alloca { i8, i32 }, align 4, addrspace(5) + %out.val = alloca { i8, i32 }, align 4, addrspace(5) + %in.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 0 + %in.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 1 + store i8 3, i8 addrspace(5)* %in.gep0 + store i32 8, i32 addrspace(5)* %in.gep1 + call void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* %out.val, { i8, i32 } addrspace(5)* %in.val) + %out.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 0 + %out.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 1 + %out.val0 = load i8, i8 addrspace(5)* %out.gep0 + %out.val1 = load i32, i32 addrspace(5)* %out.gep1 + store volatile i8 %out.val0, i8 addrspace(1)* undef + store volatile i32 %out.val1, i32 addrspace(1)* undef + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll index 353566a4d3dc79..6b29697ca086e7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll @@ -62,6 +62,7 @@ declare hidden void @external_void_func_v8i32(<8 x i32>) #0 declare hidden void @external_void_func_v16i32(<16 x i32>) #0 declare hidden void @external_void_func_v32i32(<32 x i32>) #0 declare hidden void @external_void_func_v32i32_i32(<32 x i32>, i32) #0 
+declare hidden void @external_void_func_v32i32_p3_p5(<32 x i32>, i8 addrspace(3)*, i8 addrspace(5)*) #0 declare hidden void @external_void_func_v32i32_i8_i8_i16(<32 x i32>, i8, i8, i16) #0 ; Structs @@ -3647,6 +3648,110 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i8_i8_i16() #0 { ret void } +define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 { + ; CHECK-LABEL: name: test_call_external_void_func_v32i32_p3_p5 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[COPY10:%[0-9]+]]:_(p1) = COPY [[DEF1]](p1) + ; CHECK: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load 8 from `<32 x i32> addrspace(1)* addrspace(4)* undef`, addrspace 4) + ; CHECK: [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: (load 128 from %ir.ptr0, addrspace 1) + ; CHECK: [[LOAD2:%[0-9]+]]:_(p3) = G_LOAD [[DEF1]](p1) :: (load 4 from `i8 addrspace(3)* addrspace(1)* undef`, addrspace 1) + ; CHECK: [[LOAD3:%[0-9]+]]:_(p5) = G_LOAD [[COPY10]](p1) :: (load 4 from `i8 addrspace(5)* addrspace(1)* undef`, addrspace 1) + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<32 x s32>) + ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_p3_p5 + ; CHECK: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY13]], [[C]](s64) + ; CHECK: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C1]](s32) + ; CHECK: 
[[OR:%[0-9]+]]:_(s32) = G_OR [[COPY18]], [[SHL]] + ; CHECK: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY20]], [[C2]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; CHECK: $vgpr2 = COPY [[UV2]](s32) + ; CHECK: $vgpr3 = COPY [[UV3]](s32) + ; CHECK: $vgpr4 = COPY [[UV4]](s32) + ; CHECK: $vgpr5 = COPY [[UV5]](s32) + ; CHECK: $vgpr6 = COPY [[UV6]](s32) + ; CHECK: $vgpr7 = COPY [[UV7]](s32) + ; CHECK: $vgpr8 = COPY [[UV8]](s32) + ; CHECK: $vgpr9 = COPY [[UV9]](s32) + ; CHECK: $vgpr10 = COPY [[UV10]](s32) + ; CHECK: $vgpr11 = COPY [[UV11]](s32) + ; CHECK: $vgpr12 = COPY [[UV12]](s32) + ; CHECK: $vgpr13 = COPY [[UV13]](s32) + ; CHECK: $vgpr14 = COPY [[UV14]](s32) + ; CHECK: $vgpr15 = COPY [[UV15]](s32) + ; CHECK: $vgpr16 = COPY [[UV16]](s32) + ; CHECK: $vgpr17 = COPY [[UV17]](s32) + ; CHECK: $vgpr18 = COPY [[UV18]](s32) + ; CHECK: $vgpr19 = COPY [[UV19]](s32) + ; CHECK: $vgpr20 = COPY [[UV20]](s32) + ; CHECK: $vgpr21 = COPY [[UV21]](s32) + ; CHECK: $vgpr22 = COPY [[UV22]](s32) + ; CHECK: $vgpr23 = COPY [[UV23]](s32) + ; CHECK: $vgpr24 = COPY [[UV24]](s32) + ; CHECK: $vgpr25 = COPY [[UV25]](s32) + ; CHECK: $vgpr26 = COPY [[UV26]](s32) + ; CHECK: $vgpr27 = COPY [[UV27]](s32) + ; CHECK: $vgpr28 = COPY [[UV28]](s32) + ; CHECK: $vgpr29 = COPY [[UV29]](s32) + ; CHECK: $vgpr30 = COPY [[UV30]](s32) + ; CHECK: [[COPY21:%[0-9]+]]:_(p5) = COPY $sp_reg + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C3]](s32) + ; CHECK: G_STORE [[UV31]](s32), [[PTR_ADD1]](p5) :: (store 4 into stack, align 16, addrspace 5) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C4]](s32) + ; CHECK: G_STORE [[LOAD2]](p3), [[PTR_ADD2]](p5) :: (store 4 into stack + 4, addrspace 5) + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C5]](s32) + ; CHECK: G_STORE [[LOAD3]](p5), [[PTR_ADD3]](p5) :: (store 4 into stack + 8, align 8, addrspace 5) + ; CHECK: [[COPY22:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY22]](<4 x s32>) + ; CHECK: $sgpr4_sgpr5 = COPY [[COPY11]](p4) + ; CHECK: $sgpr6_sgpr7 = COPY [[COPY12]](p4) + ; CHECK: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; CHECK: $sgpr10_sgpr11 = COPY [[COPY14]](s64) + ; CHECK: $sgpr12 = COPY [[COPY15]](s32) + ; CHECK: $sgpr13 = COPY [[COPY16]](s32) + ; CHECK: $sgpr14 = COPY [[COPY17]](s32) + ; CHECK: $vgpr31 = COPY [[OR1]](s32) + ; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_void_func_v32i32_p3_p5, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit 
$vgpr31 + ; CHECK: ADJCALLSTACKDOWN 0, 12, implicit-def $scc + ; CHECK: S_ENDPGM 0 + %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef + %val0 = load <32 x i32>, <32 x i32> addrspace(1)* %ptr0 + %val1 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* undef + %val2 = load i8 addrspace(5)*, i8 addrspace(5)* addrspace(1)* undef + call void @external_void_func_v32i32_p3_p5(<32 x i32> %val0, i8 addrspace(3)* %val1, i8 addrspace(5)* %val2) + ret void +} + define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; CHECK-LABEL: name: test_call_external_void_func_struct_i8_i32 ; CHECK: bb.1 (%ir-block.0): diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll index 340392ea8d464d..28f60ca7528dbe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll @@ -1795,6 +1795,62 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1 ret void } +define void @void_func_v32i32_p3_p5_i16(<32 x i32> %arg0, i8 addrspace(3)* %arg1, i8 addrspace(5)* %arg2) #0 { + ; CHECK-LABEL: name: void_func_v32i32_p3_p5_i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; CHECK: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; CHECK: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18 + ; CHECK: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19 + ; CHECK: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20 + ; CHECK: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21 + ; CHECK: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22 + ; CHECK: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23 + ; CHECK: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr24 + ; CHECK: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr25 + ; CHECK: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr26 + ; CHECK: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr27 + ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28 + ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29 + ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30 + ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31 + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.1, align 16, addrspace 5) + ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: 
[[LOAD1:%[0-9]+]]:_(p5) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 4 from %fixed-stack.0, addrspace 5) + ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32) + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) + ; CHECK: [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) + ; CHECK: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store 128 into `<32 x i32> addrspace(1)* undef`, addrspace 1) + ; CHECK: G_STORE [[LOAD]](p3), [[COPY33]](p1) :: (volatile store 4 into `i8 addrspace(3)* addrspace(1)* undef`, addrspace 1) + ; CHECK: G_STORE [[LOAD1]](p5), [[COPY34]](p1) :: (volatile store 4 into `i8 addrspace(5)* addrspace(1)* undef`, addrspace 1) + ; CHECK: [[COPY35:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] + ; CHECK: S_SETPC_B64_return [[COPY35]] + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile i8 addrspace(3)* %arg1, i8 addrspace(3)* addrspace(1)* undef + store volatile i8 addrspace(5)* %arg2, i8 addrspace(5)* addrspace(1)* undef + ret void +} + define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 { ; CHECK-LABEL: name: void_func_v32i32_v2i32_v2f32 ; CHECK: bb.1 (%ir-block.0): diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs.ll new file mode 100644 index 00000000000000..0f84f12a12574d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s + +declare i32 @llvm.umin.i32(i32, i32) +declare i32 @llvm.umax.i32(i32, i32) +declare i32 @llvm.smin.i32(i32, i32) +declare i32 @llvm.smax.i32(i32, i32) + +declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>) + +define i32 @test_umin_i32(i32 %a, i32 %b) { +; CHECK-LABEL: test_umin_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_min_u32_e32 v0, v0, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %r = call i32 @llvm.umin.i32(i32 %a, i32 %b) + ret i32 %r +} + +define i32 @test_umax_i32(i32 %a, i32 %b) { +; CHECK-LABEL: test_umax_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_max_u32_e32 v0, v0, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %r = call i32 @llvm.umax.i32(i32 %a, i32 %b) + ret i32 %r +} + +define i32 @test_smin_i32(i32 %a, i32 %b) { +; CHECK-LABEL: test_smin_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_min_i32_e32 v0, v0, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %r = call i32 @llvm.smin.i32(i32 %a, i32 %b) + ret i32 %r +} + +define i32 @test_smax_i32(i32 %a, i32 
%b) { +; CHECK-LABEL: test_smax_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_max_i32_e32 v0, v0, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %r = call i32 @llvm.smax.i32(i32 %a, i32 %b) + ret i32 %r +} + +define <4 x i32> @test_umin_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_umin_v4i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_min_u32_e32 v0, v0, v4 +; CHECK-NEXT: v_min_u32_e32 v1, v1, v5 +; CHECK-NEXT: v_min_u32_e32 v2, v2, v6 +; CHECK-NEXT: v_min_u32_e32 v3, v3, v7 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %r = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %r +} + +define <4 x i32> @test_umax_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_umax_v4i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_max_u32_e32 v0, v0, v4 +; CHECK-NEXT: v_max_u32_e32 v1, v1, v5 +; CHECK-NEXT: v_max_u32_e32 v2, v2, v6 +; CHECK-NEXT: v_max_u32_e32 v3, v3, v7 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %r = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %r +} + +define <4 x i32> @test_smin_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_smin_v4i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_min_i32_e32 v0, v0, v4 +; CHECK-NEXT: v_min_i32_e32 v1, v1, v5 +; CHECK-NEXT: v_min_i32_e32 v2, v2, v6 +; CHECK-NEXT: v_min_i32_e32 v3, v3, v7 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %r = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %r +} + +define <4 x i32> @test_smax_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_smax_v4i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_max_i32_e32 v0, v0, v4 +; CHECK-NEXT: v_max_i32_e32 v1, v1, v5 +; CHECK-NEXT: v_max_i32_e32 v2, v2, v6 +; CHECK-NEXT: v_max_i32_e32 v3, v3, v7 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %r = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %r +} diff --git a/llvm/test/CodeGen/AMDGPU/huge-number-operand-folds.mir b/llvm/test/CodeGen/AMDGPU/huge-number-operand-folds.mir new file mode 100644 index 00000000000000..1a8feddeea82bf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/huge-number-operand-folds.mir @@ -0,0 +1,22 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-fold-operands %s -o - | FileCheck -check-prefix=GCN %s + +# We were storing fold candidate uses in an unsigned char, which this exceeds. +# The use operand overflows and the expected register operand hits the immediate 0. +# We never have more than a handful of non-implicit operands, so don't try to fold into +# implicit operands to avoid this problem. 
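To make the failure mode described in the comment above concrete, here is a minimal standalone C++ sketch. It is not the actual SIFoldOperands code; the struct and field names are hypothetical. It only illustrates how an operand index stored in an unsigned char wraps once an instruction carries more than 255 operands, as the S_ENDPGM in the test below does with its long run of implicit uses.

// Minimal sketch, assuming a hypothetical FoldCandidate-like record that
// narrows the use-operand index to 8 bits (the pitfall described above).
#include <cstdio>

struct FoldCandidate {
  unsigned char UseOpNo; // 8-bit index: wraps for operand numbers > 255
};

int main() {
  const unsigned RealUseOpNo = 300; // e.g. one of the many implicit uses below
  FoldCandidate C{static_cast<unsigned char>(RealUseOpNo)};
  // 300 wraps modulo 256 to 44, so a later fold would patch the wrong operand.
  std::printf("stored index %u, actual index %u\n",
              static_cast<unsigned>(C.UseOpNo), RealUseOpNo);
  return 0;
}

This matches the fix the comment describes: since real code never needs that many explicit operands, declining to fold into implicit operands sidesteps the narrow index entirely.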
+ +--- +name: op_idx_overflows_uchar +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr12_sgpr13_sgpr14_sgpr15 + + ; GCN-LABEL: name: op_idx_overflows_uchar + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]] + %0:sreg_32 = S_MOV_B32 0 + %1:vgpr_32 = COPY %0 + S_ENDPGM 0, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit 
%1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1 +... diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll new file mode 100644 index 00000000000000..345830676abaaa --- /dev/null +++ b/llvm/test/CodeGen/X86/abs.ll @@ -0,0 +1,618 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefixes=X64,SSE +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx | FileCheck %s --check-prefixes=X64,AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx2 | FileCheck %s --check-prefixes=X64,AVX,AVX2 +; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86 + +; The i1 parameter is not codegen-relevant right now. + +declare i8 @llvm.abs.i8(i8, i1) +declare i16 @llvm.abs.i16(i16, i1) +declare i24 @llvm.abs.i24(i24, i1) +declare i32 @llvm.abs.i32(i32, i1) +declare i64 @llvm.abs.i64(i64, i1) +declare i128 @llvm.abs.i128(i128, i1) + +declare <1 x i32> @llvm.abs.v1i32(<1 x i32>, i1) +declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1) +declare <3 x i32> @llvm.abs.v3i32(<3 x i32>, i1) +declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) +declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) + +declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1) +declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1) + +define i8 @test_i8(i8 %a) nounwind { +; X64-LABEL: test_i8: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movl %edi, %ecx +; X64-NEXT: sarb $7, %cl +; X64-NEXT: leal (%rdi,%rcx), %eax +; X64-NEXT: xorb %cl, %al +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_i8: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: sarb $7, %cl +; X86-NEXT: addb %cl, %al +; X86-NEXT: xorb %cl, %al +; X86-NEXT: retl + %r = call i8 @llvm.abs.i8(i8 %a, i1 false) + ret i8 %r +} + +define i16 @test_i16(i16 %a) nounwind { +; X64-LABEL: test_i16: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: negw %ax +; X64-NEXT: cmovlw %di, %ax +; X64-NEXT: retq +; +; X86-LABEL: test_i16: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negw %ax +; X86-NEXT: cmovlw %cx, %ax +; X86-NEXT: retl + %r = call i16 @llvm.abs.i16(i16 %a, i1 false) + ret i16 %r +} + +define i24 @test_i24(i24 %a) nounwind { +; X64-LABEL: test_i24: +; X64: # %bb.0: +; X64-NEXT: shll $8, %edi +; X64-NEXT: sarl $8, %edi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmovll %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_i24: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $8, %ecx +; X86-NEXT: sarl $8, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: retl + %r = call i24 @llvm.abs.i24(i24 %a, i1 false) + ret i24 %r +} + +define i32 @test_i32(i32 %a) nounwind { +; X64-LABEL: test_i32: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmovll %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: retl + %r = call i32 @llvm.abs.i32(i32 %a, i1 false) + ret i32 %r +} + +define i64 @test_i64(i64 %a) nounwind { +; X64-LABEL: test_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax 
+; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rdi, %rax +; X64-NEXT: retq +; +; X86-LABEL: test_i64: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: retl + %r = call i64 @llvm.abs.i64(i64 %a, i1 false) + ret i64 %r +} + +define i128 @test_i128(i128 %a) nounwind { +; X64-LABEL: test_i128: +; X64: # %bb.0: +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: negq %rax +; X64-NEXT: sbbq %rsi, %rdx +; X64-NEXT: testq %rsi, %rsi +; X64-NEXT: cmovnsq %rdi, %rax +; X64-NEXT: cmovnsq %rsi, %rdx +; X64-NEXT: retq +; +; X86-LABEL: test_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: negl %edi +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: movl $0, %ebp +; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: testl %eax, %eax +; X86-NEXT: cmovnsl %eax, %esi +; X86-NEXT: cmovnsl %ecx, %ebp +; X86-NEXT: cmovnsl %edx, %ebx +; X86-NEXT: cmovnsl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: movl %ebp, 8(%eax) +; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call i128 @llvm.abs.i128(i128 %a, i1 false) + ret i128 %r +} + +define <1 x i32> @test_v1i32(<1 x i32> %a) nounwind { +; X64-LABEL: test_v1i32: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmovll %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_v1i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: retl + %r = call <1 x i32> @llvm.abs.v1i32(<1 x i32> %a, i1 false) + ret <1 x i32> %r +} + +define <2 x i32> @test_v2i32(<2 x i32> %a) nounwind { +; SSE-LABEL: test_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpabsd %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v2i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: cmovll %edx, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: negl %edx +; X86-NEXT: cmovll %ecx, %edx +; X86-NEXT: retl + %r = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %a, i1 false) + ret <2 x i32> %r +} + +define <3 x i32> @test_v3i32(<3 x i32> %a) nounwind { +; SSE-LABEL: test_v3i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v3i32: +; AVX: # %bb.0: +; AVX-NEXT: vpabsd %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v3i32: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax 
+; X86-NEXT: negl %eax +; X86-NEXT: cmovll %edx, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: negl %edx +; X86-NEXT: cmovll %ecx, %edx +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: negl %ecx +; X86-NEXT: cmovll %esi, %ecx +; X86-NEXT: popl %esi +; X86-NEXT: retl + %r = call <3 x i32> @llvm.abs.v3i32(<3 x i32> %a, i1 false) + ret <3 x i32> %r +} + +define <4 x i32> @test_v4i32(<4 x i32> %a) nounwind { +; SSE-LABEL: test_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpabsd %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v4i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: negl %ebx +; X86-NEXT: cmovll %edi, %ebx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: negl %edi +; X86-NEXT: cmovll %esi, %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: negl %esi +; X86-NEXT: cmovll %edx, %esi +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: negl %edx +; X86-NEXT: cmovll %ecx, %edx +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl $4 + %r = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a, i1 false) + ret <4 x i32> %r +} + +define <8 x i32> @test_v8i32(<8 x i32> %a) nounwind { +; SSE-LABEL: test_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: paddd %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpabsd %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpabsd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpabsd %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; X86-LABEL: test_v8i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: negl %ecx +; X86-NEXT: cmovll %edx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: negl %ecx +; X86-NEXT: cmovll %ebp, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: negl %ebp +; X86-NEXT: cmovll %ebx, %ebp +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: negl %ebx +; X86-NEXT: cmovll %edi, %ebx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: negl %edi +; X86-NEXT: cmovll %esi, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: negl %esi +; X86-NEXT: cmovll %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, 
%ecx +; X86-NEXT: negl %ecx +; X86-NEXT: cmovll %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ecx, 28(%edx) +; X86-NEXT: movl %eax, 24(%edx) +; X86-NEXT: movl %esi, 20(%edx) +; X86-NEXT: movl %edi, 16(%edx) +; X86-NEXT: movl %ebx, 12(%edx) +; X86-NEXT: movl %ebp, 8(%edx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 4(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, (%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a, i1 false) + ret <8 x i32> %r +} + +define <8 x i16> @test_v8i16(<8 x i16> %a) nounwind { +; SSE-LABEL: test_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psraw $15, %xmm1 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpabsw %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v8i16: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: negw %cx +; X86-NEXT: cmovlw %dx, %cx +; X86-NEXT: movw %cx, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: negw %cx +; X86-NEXT: cmovlw %bp, %cx +; X86-NEXT: movw %cx, (%esp) # 2-byte Spill +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: negw %bp +; X86-NEXT: cmovlw %bx, %bp +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: negw %bx +; X86-NEXT: cmovlw %di, %bx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: negw %di +; X86-NEXT: cmovlw %si, %di +; X86-NEXT: movl %eax, %esi +; X86-NEXT: negw %si +; X86-NEXT: cmovlw %ax, %si +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negw %ax +; X86-NEXT: cmovlw %cx, %ax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: negw %cx +; X86-NEXT: cmovlw %dx, %cx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movw %cx, 14(%edx) +; X86-NEXT: movw %ax, 12(%edx) +; X86-NEXT: movw %si, 10(%edx) +; X86-NEXT: movw %di, 8(%edx) +; X86-NEXT: movw %bx, 6(%edx) +; X86-NEXT: movw %bp, 4(%edx) +; X86-NEXT: movzwl (%esp), %eax # 2-byte Folded Reload +; X86-NEXT: movw %ax, 2(%edx) +; X86-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; X86-NEXT: movw %ax, (%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: addl $4, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a, i1 false) + ret <8 x i16> %r +} + +define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind { +; SSE-LABEL: test_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpabsb %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v16i8: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %esi +; X86-NEXT: subl $12, %esp +; X86-NEXT: movb {{[0-9]+}}(%esp), %bh +; X86-NEXT: movb {{[0-9]+}}(%esp), %bl +; 
X86-NEXT: movb {{[0-9]+}}(%esp), %dh +; X86-NEXT: movb {{[0-9]+}}(%esp), %ch +; X86-NEXT: movb {{[0-9]+}}(%esp), %ah +; X86-NEXT: movb {{[0-9]+}}(%esp), %dl +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movb %cl, %al +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %cl +; X86-NEXT: xorb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb %dl, %al +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %dl +; X86-NEXT: xorb %al, %dl +; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb %ah, %al +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %ah +; X86-NEXT: xorb %al, %ah +; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb %ch, %al +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %ch +; X86-NEXT: xorb %al, %ch +; X86-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb %dh, %al +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %dh +; X86-NEXT: xorb %al, %dh +; X86-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %bl +; X86-NEXT: xorb %al, %bl +; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb %bh, %al +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %bh +; X86-NEXT: xorb %al, %bh +; X86-NEXT: movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %cl +; X86-NEXT: xorb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %cl +; X86-NEXT: xorb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb {{[0-9]+}}(%esp), %bh +; X86-NEXT: movb %bh, %al +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %bh +; X86-NEXT: xorb %al, %bh +; X86-NEXT: movb {{[0-9]+}}(%esp), %bl +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %bl +; X86-NEXT: xorb %al, %bl +; X86-NEXT: movb {{[0-9]+}}(%esp), %dh +; X86-NEXT: movb %dh, %al +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %dh +; X86-NEXT: xorb %al, %dh +; X86-NEXT: movb {{[0-9]+}}(%esp), %ch +; X86-NEXT: movb %ch, %al +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %ch +; X86-NEXT: xorb %al, %ch +; X86-NEXT: movb {{[0-9]+}}(%esp), %dl +; X86-NEXT: movl %edx, %eax +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %dl +; X86-NEXT: xorb %al, %dl +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %cl +; X86-NEXT: xorb %al, %cl +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: movb %al, %ah +; X86-NEXT: sarb $7, %ah +; X86-NEXT: addb %ah, %al +; X86-NEXT: xorb %ah, %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movb %al, 15(%esi) +; X86-NEXT: movb %cl, 14(%esi) +; X86-NEXT: movb %dl, 13(%esi) +; X86-NEXT: movb %ch, 12(%esi) +; X86-NEXT: movb %dh, 11(%esi) +; X86-NEXT: movb %bl, 10(%esi) +; X86-NEXT: movb %bh, 9(%esi) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; X86-NEXT: movb %al, 8(%esi) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; X86-NEXT: movb %al, 7(%esi) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; X86-NEXT: movb %al, 6(%esi) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; X86-NEXT: movb %al, 5(%esi) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte 
Reload +; X86-NEXT: movb %al, 4(%esi) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; X86-NEXT: movb %al, 3(%esi) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; X86-NEXT: movb %al, 2(%esi) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; X86-NEXT: movb %al, 1(%esi) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; X86-NEXT: movb %al, (%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl $12, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %ebx +; X86-NEXT: retl $4 + %r = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a, i1 false) + ret <16 x i8> %r +} diff --git a/llvm/test/CodeGen/X86/smax.ll b/llvm/test/CodeGen/X86/smax.ll new file mode 100644 index 00000000000000..31d0822f8090a2 --- /dev/null +++ b/llvm/test/CodeGen/X86/smax.ll @@ -0,0 +1,662 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefixes=X64,SSE +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx | FileCheck %s --check-prefixes=X64,AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx2 | FileCheck %s --check-prefixes=X64,AVX,AVX2 +; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86 + +declare i8 @llvm.smax.i8(i8, i8) +declare i16 @llvm.smax.i16(i16, i16) +declare i24 @llvm.smax.i24(i24, i24) +declare i32 @llvm.smax.i32(i32, i32) +declare i64 @llvm.smax.i64(i64, i64) +declare i128 @llvm.smax.i128(i128, i128) + +declare <1 x i32> @llvm.smax.v1i32(<1 x i32>, <1 x i32>) +declare <2 x i32> @llvm.smax.v2i32(<2 x i32>, <2 x i32>) +declare <3 x i32> @llvm.smax.v3i32(<3 x i32>, <3 x i32>) +declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>) + +declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>) +declare <16 x i8> @llvm.smax.v16i8(<16 x i8>, <16 x i8>) + +define i8 @test_i8(i8 %a, i8 %b) nounwind { +; X64-LABEL: test_i8: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpb %al, %dil +; X64-NEXT: cmovgl %edi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_i8: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpb %al, %cl +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %r = call i8 @llvm.smax.i8(i8 %a, i8 %b) + ret i8 %r +} + +define i16 @test_i16(i16 %a, i16 %b) nounwind { +; X64-LABEL: test_i16: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpw %ax, %di +; X64-NEXT: cmovgl %edi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl + %r = call i16 @llvm.smax.i16(i16 %a, i16 %b) + ret i16 %r +} + +define i24 @test_i24(i24 %a, i24 %b) nounwind { +; X64-LABEL: test_i24: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll $8, %esi +; X64-NEXT: sarl $8, %esi +; X64-NEXT: shll $8, %eax +; X64-NEXT: sarl $8, %eax +; X64-NEXT: cmpl %esi, %eax +; X64-NEXT: cmovlel %esi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_i24: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $8, %ecx +; X86-NEXT: sarl $8, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $8, %eax +; X86-NEXT: sarl $8, 
%eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovlel %ecx, %eax +; X86-NEXT: retl + %r = call i24 @llvm.smax.i24(i24 %a, i24 %b) + ret i24 %r +} + +define i32 @test_i32(i32 %a, i32 %b) nounwind { +; X64-LABEL: test_i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmovgl %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: retl + %r = call i32 @llvm.smax.i32(i32 %a, i32 %b) + ret i32 %r +} + +define i64 @test_i64(i64 %a, i64 %b) nounwind { +; X64-LABEL: test_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: cmovgq %rdi, %rax +; X64-NEXT: retq +; +; X86-LABEL: test_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: cmoval %ecx, %edi +; X86-NEXT: cmpl %edx, %esi +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: cmovel %edi, %eax +; X86-NEXT: cmovgl %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl + %r = call i64 @llvm.smax.i64(i64 %a, i64 %b) + ret i64 %r +} + +define i128 @test_i128(i128 %a, i128 %b) nounwind { +; X64-LABEL: test_i128: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: cmovaq %rdi, %rdx +; X64-NEXT: cmpq %rcx, %rsi +; X64-NEXT: cmovgq %rdi, %rax +; X64-NEXT: cmoveq %rdx, %rax +; X64-NEXT: cmovgq %rsi, %rcx +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: retq +; +; X86-LABEL: test_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %ebx, %edx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: cmoval %edx, %eax +; X86-NEXT: cmpl %esi, %ecx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: cmoval %edx, %ebp +; X86-NEXT: cmovel %eax, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: cmovel %ebp, %ebx +; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %eax, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: cmoval %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmpl %edx, %ebp +; X86-NEXT: cmovgl %edi, %eax +; X86-NEXT: cmovel %ecx, %eax +; X86-NEXT: cmovgl %ebp, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl %eax, 8(%ecx) +; X86-NEXT: movl %esi, 4(%ecx) +; X86-NEXT: movl %ebx, (%ecx) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $4, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; 
X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call i128 @llvm.smax.i128(i128 %a, i128 %b) + ret i128 %r +} + +define <1 x i32> @test_v1i32(<1 x i32> %a, <1 x i32> %b) nounwind { +; X64-LABEL: test_v1i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmovgl %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_v1i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: retl + %r = call <1 x i32> @llvm.smax.v1i32(<1 x i32> %a, <1 x i32> %b) + ret <1 x i32> %r +} + +define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { +; SSE-LABEL: test_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v2i32: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %esi +; X86-NEXT: cmovgl %esi, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmovgl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: retl + %r = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %a, <2 x i32> %b) + ret <2 x i32> %r +} + +define <3 x i32> @test_v3i32(<3 x i32> %a, <3 x i32> %b) nounwind { +; SSE-LABEL: test_v3i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v3i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v3i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmpl %eax, %ebx +; X86-NEXT: cmovgl %ebx, %eax +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: cmovgl %edi, %edx +; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: cmovgl %esi, %ecx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl + %r = call <3 x i32> @llvm.smax.v3i32(<3 x i32> %a, <3 x i32> %b) + ret <3 x i32> %r +} + +define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { +; SSE-LABEL: test_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v4i32: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmovgl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %esi, %eax +; 
X86-NEXT: cmovgl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovgl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl $4 + %r = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %r +} + +define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { +; SSE-LABEL: test_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; X86-LABEL: test_v8i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebp, %eax +; X86-NEXT: cmovgl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebx, %eax +; X86-NEXT: cmovgl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmovgl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmovgl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovgl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %eax, %edx +; X86-NEXT: cmovgl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %eax, 28(%edx) +; X86-NEXT: movl %ecx, 24(%edx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 16(%edx) +; X86-NEXT: movl %esi, 12(%edx) +; X86-NEXT: movl %edi, 8(%edx) +; X86-NEXT: movl %ebx, 4(%edx) +; X86-NEXT: movl %ebp, (%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %r 
+} + +define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { +; SSE-LABEL: test_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pmaxsw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v8i16: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bp, %ax +; X86-NEXT: cmovgl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bx, %ax +; X86-NEXT: cmovgl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %di, %ax +; X86-NEXT: cmovgl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %si, %ax +; X86-NEXT: cmovgl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmovgl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmovgl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movw %ax, 14(%ecx) +; X86-NEXT: movw %dx, 12(%ecx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 10(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 8(%ecx) +; X86-NEXT: movw %si, 6(%ecx) +; X86-NEXT: movw %di, 4(%ecx) +; X86-NEXT: movw %bx, 2(%ecx) +; X86-NEXT: movw %bp, (%ecx) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %r +} + +define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { +; SSE-LABEL: test_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v16i8: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $40, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %bl, %al +; X86-NEXT: cmovgl %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %dl, %al +; X86-NEXT: cmovgl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %bl, %al +; X86-NEXT: cmovgl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %dl, %al +; X86-NEXT: cmovgl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb %cl, 15(%eax) +; X86-NEXT: movb %dl, 14(%eax) +; X86-NEXT: movb %bl, 13(%eax) +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movb %cl, 12(%eax) +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movb %cl, 11(%eax) +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movb %cl, 10(%eax) +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 9(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 7(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 6(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 5(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 4(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 3(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 2(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 1(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 
4-byte Reload +; X86-NEXT: movb %cl, (%eax) +; X86-NEXT: addl $40, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %r +} diff --git a/llvm/test/CodeGen/X86/smin.ll b/llvm/test/CodeGen/X86/smin.ll new file mode 100644 index 00000000000000..70391534f544c1 --- /dev/null +++ b/llvm/test/CodeGen/X86/smin.ll @@ -0,0 +1,656 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefixes=X64,SSE +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx | FileCheck %s --check-prefixes=X64,AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx2 | FileCheck %s --check-prefixes=X64,AVX,AVX2 +; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86 + +declare i8 @llvm.smin.i8(i8, i8) +declare i16 @llvm.smin.i16(i16, i16) +declare i24 @llvm.smin.i24(i24, i24) +declare i32 @llvm.smin.i32(i32, i32) +declare i64 @llvm.smin.i64(i64, i64) +declare i128 @llvm.smin.i128(i128, i128) + +declare <1 x i32> @llvm.smin.v1i32(<1 x i32>, <1 x i32>) +declare <2 x i32> @llvm.smin.v2i32(<2 x i32>, <2 x i32>) +declare <3 x i32> @llvm.smin.v3i32(<3 x i32>, <3 x i32>) +declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>) + +declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>) +declare <16 x i8> @llvm.smin.v16i8(<16 x i8>, <16 x i8>) + +define i8 @test_i8(i8 %a, i8 %b) nounwind { +; X64-LABEL: test_i8: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpb %al, %dil +; X64-NEXT: cmovll %edi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_i8: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpb %al, %cl +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %r = call i8 @llvm.smin.i8(i8 %a, i8 %b) + ret i8 %r +} + +define i16 @test_i16(i16 %a, i16 %b) nounwind { +; X64-LABEL: test_i16: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpw %ax, %di +; X64-NEXT: cmovll %edi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl + %r = call i16 @llvm.smin.i16(i16 %a, i16 %b) + ret i16 %r +} + +define i24 @test_i24(i24 %a, i24 %b) nounwind { +; X64-LABEL: test_i24: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll $8, %esi +; X64-NEXT: sarl $8, %esi +; X64-NEXT: shll $8, %eax +; X64-NEXT: sarl $8, %eax +; X64-NEXT: cmpl %esi, %eax +; X64-NEXT: cmovgel %esi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_i24: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $8, %ecx +; X86-NEXT: sarl $8, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $8, %eax +; X86-NEXT: sarl $8, %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovgel %ecx, %eax +; X86-NEXT: retl + %r = call i24 @llvm.smin.i24(i24 %a, i24 %b) + ret i24 %r +} + +define i32 @test_i32(i32 %a, i32 %b) nounwind { +; X64-LABEL: test_i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmovll %edi, %eax +; X64-NEXT: retq +; +; 
X86-LABEL: test_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: retl + %r = call i32 @llvm.smin.i32(i32 %a, i32 %b) + ret i32 %r +} + +define i64 @test_i64(i64 %a, i64 %b) nounwind { +; X64-LABEL: test_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: cmovlq %rdi, %rax +; X64-NEXT: retq +; +; X86-LABEL: test_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: cmovbl %ecx, %edi +; X86-NEXT: cmpl %edx, %esi +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: cmovel %edi, %eax +; X86-NEXT: cmovll %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl + %r = call i64 @llvm.smin.i64(i64 %a, i64 %b) + ret i64 %r +} + +define i128 @test_i128(i128 %a, i128 %b) nounwind { +; X64-LABEL: test_i128: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: cmovbq %rdi, %rdx +; X64-NEXT: cmpq %rcx, %rsi +; X64-NEXT: cmovlq %rdi, %rax +; X64-NEXT: cmoveq %rdx, %rax +; X64-NEXT: cmovlq %rsi, %rcx +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: retq +; +; X86-LABEL: test_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: cmpl %esi, %edi +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: cmovbl %eax, %ebp +; X86-NEXT: cmovel %ebx, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: cmovbl %edi, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: movl %edx, %eax +; X86-NEXT: cmovbl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: xorl %edx, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: cmovel %ebp, %ecx +; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %eax, %edi +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %eax, 12(%edi) +; X86-NEXT: movl %edx, 8(%edi) +; X86-NEXT: movl %esi, 4(%edi) +; X86-NEXT: movl %ecx, (%edi) +; X86-NEXT: movl %edi, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call i128 @llvm.smin.i128(i128 %a, i128 %b) + ret i128 %r +} + +define <1 x i32> @test_v1i32(<1 x i32> %a, <1 x i32> %b) nounwind { +; X64-LABEL: test_v1i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, 
%edi +; X64-NEXT: cmovll %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_v1i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: retl + %r = call <1 x i32> @llvm.smin.v1i32(<1 x i32> %a, <1 x i32> %b) + ret <1 x i32> %r +} + +define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { +; SSE-LABEL: test_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v2i32: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %esi +; X86-NEXT: cmovll %esi, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmovll %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: retl + %r = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %a, <2 x i32> %b) + ret <2 x i32> %r +} + +define <3 x i32> @test_v3i32(<3 x i32> %a, <3 x i32> %b) nounwind { +; SSE-LABEL: test_v3i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v3i32: +; AVX: # %bb.0: +; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v3i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmpl %eax, %ebx +; X86-NEXT: cmovll %ebx, %eax +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: cmovll %edi, %edx +; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: cmovll %esi, %ecx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl + %r = call <3 x i32> @llvm.smin.v3i32(<3 x i32> %a, <3 x i32> %b) + ret <3 x i32> %r +} + +define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { +; SSE-LABEL: test_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v4i32: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmovll %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmovll %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; 
X86-NEXT: movl %edi, (%eax) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl $4 + %r = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %r +} + +define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { +; SSE-LABEL: test_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; X86-LABEL: test_v8i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebp, %eax +; X86-NEXT: cmovll %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebx, %eax +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmovll %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmovll %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %eax, %edx +; X86-NEXT: cmovll %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %eax, 28(%edx) +; X86-NEXT: movl %ecx, 24(%edx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 16(%edx) +; X86-NEXT: movl %esi, 12(%edx) +; X86-NEXT: movl %edi, 8(%edx) +; X86-NEXT: movl %ebx, 4(%edx) +; X86-NEXT: movl %ebp, (%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %r +} + +define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { +; SSE-LABEL: test_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pminsw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v8i16: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, 
%esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bp, %ax +; X86-NEXT: cmovll %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bx, %ax +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %di, %ax +; X86-NEXT: cmovll %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %si, %ax +; X86-NEXT: cmovll %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movw %ax, 14(%ecx) +; X86-NEXT: movw %dx, 12(%ecx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 10(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 8(%ecx) +; X86-NEXT: movw %si, 6(%ecx) +; X86-NEXT: movw %di, 4(%ecx) +; X86-NEXT: movw %bx, 2(%ecx) +; X86-NEXT: movw %bp, (%ecx) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %r +} + +define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { +; SSE-LABEL: test_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v16i8: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $40, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %bl, %al +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %dl, %al +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %bl, %al +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %dl, %al +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb %cl, 15(%eax) +; X86-NEXT: movb %dl, 14(%eax) +; X86-NEXT: movb %bl, 13(%eax) +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movb %cl, 12(%eax) +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movb %cl, 11(%eax) +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movb %cl, 10(%eax) +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 9(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 7(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 6(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 5(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 4(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 3(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 2(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 1(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, (%eax) +; X86-NEXT: addl $40, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %r +} diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll new file mode 100644 index 00000000000000..14a0248e191457 --- /dev/null +++ b/llvm/test/CodeGen/X86/umax.ll @@ -0,0 
+1,668 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefixes=X64,SSE +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx | FileCheck %s --check-prefixes=X64,AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx2 | FileCheck %s --check-prefixes=X64,AVX,AVX2 +; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86 + +declare i8 @llvm.umax.i8(i8, i8) +declare i16 @llvm.umax.i16(i16, i16) +declare i24 @llvm.umax.i24(i24, i24) +declare i32 @llvm.umax.i32(i32, i32) +declare i64 @llvm.umax.i64(i64, i64) +declare i128 @llvm.umax.i128(i128, i128) + +declare <1 x i32> @llvm.umax.v1i32(<1 x i32>, <1 x i32>) +declare <2 x i32> @llvm.umax.v2i32(<2 x i32>, <2 x i32>) +declare <3 x i32> @llvm.umax.v3i32(<3 x i32>, <3 x i32>) +declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>) + +declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>) +declare <16 x i8> @llvm.umax.v16i8(<16 x i8>, <16 x i8>) + +define i8 @test_i8(i8 %a, i8 %b) nounwind { +; X64-LABEL: test_i8: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpb %al, %dil +; X64-NEXT: cmoval %edi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_i8: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpb %al, %cl +; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %r = call i8 @llvm.umax.i8(i8 %a, i8 %b) + ret i8 %r +} + +define i16 @test_i16(i16 %a, i16 %b) nounwind { +; X64-LABEL: test_i16: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpw %ax, %di +; X64-NEXT: cmoval %edi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl + %r = call i16 @llvm.umax.i16(i16 %a, i16 %b) + ret i16 %r +} + +define i24 @test_i24(i24 %a, i24 %b) nounwind { +; X64-LABEL: test_i24: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: andl $16777215, %esi # imm = 0xFFFFFF +; X64-NEXT: andl $16777215, %eax # imm = 0xFFFFFF +; X64-NEXT: cmpl %esi, %eax +; X64-NEXT: cmovbel %esi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_i24: +; X86: # %bb.0: +; X86-NEXT: movl $16777215, %eax # imm = 0xFFFFFF +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl %eax, %ecx +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovbel %ecx, %eax +; X86-NEXT: retl + %r = call i24 @llvm.umax.i24(i24 %a, i24 %b) + ret i24 %r +} + +define i32 @test_i32(i32 %a, i32 %b) nounwind { +; X64-LABEL: test_i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmoval %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: retl + %r = call i32 @llvm.umax.i32(i32 %a, i32 %b) + ret i32 %r +} + +define i64 @test_i64(i64 %a, i64 %b) nounwind { +; X64-LABEL: test_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: cmovaq %rdi, %rax +; X64-NEXT: retq +; +; X86-LABEL: test_i64: +; X86: # 
%bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: cmoval %ecx, %edi +; X86-NEXT: cmpl %edx, %esi +; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: cmovel %edi, %eax +; X86-NEXT: cmoval %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl + %r = call i64 @llvm.umax.i64(i64 %a, i64 %b) + ret i64 %r +} + +define i128 @test_i128(i128 %a, i128 %b) nounwind { +; X64-LABEL: test_i128: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: cmovaq %rdi, %rdx +; X64-NEXT: cmpq %rcx, %rsi +; X64-NEXT: cmovaq %rdi, %rax +; X64-NEXT: cmoveq %rdx, %rax +; X64-NEXT: cmovaq %rsi, %rcx +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: retq +; +; X86-LABEL: test_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %ebx, %edx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: cmoval %edx, %eax +; X86-NEXT: cmpl %esi, %ecx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: cmoval %edx, %ebp +; X86-NEXT: cmovel %eax, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: cmovel %ebp, %ebx +; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %eax, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: cmoval %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmpl %edx, %ebp +; X86-NEXT: cmoval %edi, %eax +; X86-NEXT: cmovel %ecx, %eax +; X86-NEXT: cmoval %ebp, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl %eax, 8(%ecx) +; X86-NEXT: movl %esi, 4(%ecx) +; X86-NEXT: movl %ebx, (%ecx) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $4, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call i128 @llvm.umax.i128(i128 %a, i128 %b) + ret i128 %r +} + +define <1 x i32> @test_v1i32(<1 x i32> %a, <1 x i32> %b) nounwind { +; X64-LABEL: test_v1i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmoval %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_v1i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: retl + %r = call <1 x i32> @llvm.umax.v1i32(<1 x i32> %a, <1 x i32> %b) + ret <1 x i32> %r +} + +define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { +; SSE-LABEL: test_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; 
SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: pxor %xmm0, %xmm2 +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v2i32: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %esi +; X86-NEXT: cmoval %esi, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmoval %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: retl + %r = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %a, <2 x i32> %b) + ret <2 x i32> %r +} + +define <3 x i32> @test_v3i32(<3 x i32> %a, <3 x i32> %b) nounwind { +; SSE-LABEL: test_v3i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: pxor %xmm0, %xmm2 +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v3i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v3i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmpl %eax, %ebx +; X86-NEXT: cmoval %ebx, %eax +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: cmoval %edi, %edx +; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: cmoval %esi, %ecx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl + %r = call <3 x i32> @llvm.umax.v3i32(<3 x i32> %a, <3 x i32> %b) + ret <3 x i32> %r +} + +define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { +; SSE-LABEL: test_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: pxor %xmm0, %xmm2 +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v4i32: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmoval %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmoval %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmoval %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl $4 + %r = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x 
i32> %b) + ret <4 x i32> %r +} + +define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { +; SSE-LABEL: test_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pxor %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pxor %xmm5, %xmm4 +; SSE-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm5, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm5 +; SSE-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; X86-LABEL: test_v8i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebp, %eax +; X86-NEXT: cmoval %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebx, %eax +; X86-NEXT: cmoval %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmoval %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmoval %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmoval %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %eax, %edx +; X86-NEXT: cmoval %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %eax, 28(%edx) +; X86-NEXT: movl %ecx, 24(%edx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 16(%edx) +; X86-NEXT: movl %esi, 12(%edx) +; X86-NEXT: movl %edi, 8(%edx) +; X86-NEXT: movl %ebx, 4(%edx) +; X86-NEXT: movl %ebp, (%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %r +} + +define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { +; SSE-LABEL: test_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pmaxsw %xmm1, %xmm0 +; SSE-NEXT: pxor 
%xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v8i16: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bp, %ax +; X86-NEXT: cmoval %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bx, %ax +; X86-NEXT: cmoval %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %di, %ax +; X86-NEXT: cmoval %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %si, %ax +; X86-NEXT: cmoval %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmoval %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmoval %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movw %ax, 14(%ecx) +; X86-NEXT: movw %dx, 12(%ecx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 10(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 8(%ecx) +; X86-NEXT: movw %si, 6(%ecx) +; X86-NEXT: movw %di, 4(%ecx) +; X86-NEXT: movw %bx, 2(%ecx) +; X86-NEXT: movw %bp, (%ecx) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %r +} + +define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { +; SSE-LABEL: test_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pmaxub %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v16i8: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $40, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %bl, %al +; X86-NEXT: cmoval %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %dl, %al +; X86-NEXT: cmoval %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb 
%cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %bl, %al +; X86-NEXT: cmoval %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %dl, %al +; X86-NEXT: cmoval %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb %cl, 15(%eax) +; X86-NEXT: movb %dl, 14(%eax) +; X86-NEXT: movb %bl, 13(%eax) +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movb %cl, 12(%eax) +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movb %cl, 11(%eax) +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movb %cl, 10(%eax) +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 9(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 7(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 6(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 5(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 4(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 3(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 2(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 1(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, (%eax) +; X86-NEXT: addl $40, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %r +} diff --git 
a/llvm/test/CodeGen/X86/umin.ll b/llvm/test/CodeGen/X86/umin.ll new file mode 100644 index 00000000000000..234c4faf6cd2b4 --- /dev/null +++ b/llvm/test/CodeGen/X86/umin.ll @@ -0,0 +1,667 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefixes=X64,SSE +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx | FileCheck %s --check-prefixes=X64,AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx2 | FileCheck %s --check-prefixes=X64,AVX,AVX2 +; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86 + +declare i8 @llvm.umin.i8(i8, i8) +declare i16 @llvm.umin.i16(i16, i16) +declare i24 @llvm.umin.i24(i24, i24) +declare i32 @llvm.umin.i32(i32, i32) +declare i64 @llvm.umin.i64(i64, i64) +declare i128 @llvm.umin.i128(i128, i128) + +declare <1 x i32> @llvm.umin.v1i32(<1 x i32>, <1 x i32>) +declare <2 x i32> @llvm.umin.v2i32(<2 x i32>, <2 x i32>) +declare <3 x i32> @llvm.umin.v3i32(<3 x i32>, <3 x i32>) +declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>) + +declare <8 x i16> @llvm.umin.v8i16(<8 x i16>, <8 x i16>) +declare <16 x i8> @llvm.umin.v16i8(<16 x i8>, <16 x i8>) + +define i8 @test_i8(i8 %a, i8 %b) nounwind { +; X64-LABEL: test_i8: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpb %al, %dil +; X64-NEXT: cmovbl %edi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_i8: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpb %al, %cl +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %r = call i8 @llvm.umin.i8(i8 %a, i8 %b) + ret i8 %r +} + +define i16 @test_i16(i16 %a, i16 %b) nounwind { +; X64-LABEL: test_i16: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpw %ax, %di +; X64-NEXT: cmovbl %edi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl + %r = call i16 @llvm.umin.i16(i16 %a, i16 %b) + ret i16 %r +} + +define i24 @test_i24(i24 %a, i24 %b) nounwind { +; X64-LABEL: test_i24: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: andl $16777215, %esi # imm = 0xFFFFFF +; X64-NEXT: andl $16777215, %eax # imm = 0xFFFFFF +; X64-NEXT: cmpl %esi, %eax +; X64-NEXT: cmovael %esi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_i24: +; X86: # %bb.0: +; X86-NEXT: movl $16777215, %eax # imm = 0xFFFFFF +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl %eax, %ecx +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: retl + %r = call i24 @llvm.umin.i24(i24 %a, i24 %b) + ret i24 %r +} + +define i32 @test_i32(i32 %a, i32 %b) nounwind { +; X64-LABEL: test_i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmovbl %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: retl + %r = call i32 @llvm.umin.i32(i32 %a, i32 %b) + ret i32 %r +} + +define i64 @test_i64(i64 %a, i64 %b) nounwind { +; 
X64-LABEL: test_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: cmovbq %rdi, %rax +; X64-NEXT: retq +; +; X86-LABEL: test_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: cmovbl %ecx, %edi +; X86-NEXT: cmpl %edx, %esi +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: cmovel %edi, %eax +; X86-NEXT: cmovbl %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl + %r = call i64 @llvm.umin.i64(i64 %a, i64 %b) + ret i64 %r +} + +define i128 @test_i128(i128 %a, i128 %b) nounwind { +; X64-LABEL: test_i128: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: cmovbq %rdi, %rdx +; X64-NEXT: cmpq %rcx, %rsi +; X64-NEXT: cmovbq %rdi, %rax +; X64-NEXT: cmoveq %rdx, %rax +; X64-NEXT: cmovbq %rsi, %rcx +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: retq +; +; X86-LABEL: test_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: cmpl %esi, %edi +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: cmovbl %eax, %ebp +; X86-NEXT: cmovel %ebx, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: cmovbl %edi, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: movl %edx, %eax +; X86-NEXT: cmovbl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: xorl %edx, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: cmovel %ebp, %ecx +; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %eax, %edi +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovbl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %eax, 12(%edi) +; X86-NEXT: movl %edx, 8(%edi) +; X86-NEXT: movl %esi, 4(%edi) +; X86-NEXT: movl %ecx, (%edi) +; X86-NEXT: movl %edi, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call i128 @llvm.umin.i128(i128 %a, i128 %b) + ret i128 %r +} + +define <1 x i32> @test_v1i32(<1 x i32> %a, <1 x i32> %b) nounwind { +; X64-LABEL: test_v1i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmovbl %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_v1i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: retl + %r = call <1 x i32> @llvm.umin.v1i32(<1 x i32> %a, <1 x 
i32> %b) + ret <1 x i32> %r +} + +define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { +; SSE-LABEL: test_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: pxor %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v2i32: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %esi +; X86-NEXT: cmovbl %esi, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmovbl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: retl + %r = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %a, <2 x i32> %b) + ret <2 x i32> %r +} + +define <3 x i32> @test_v3i32(<3 x i32> %a, <3 x i32> %b) nounwind { +; SSE-LABEL: test_v3i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: pxor %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v3i32: +; AVX: # %bb.0: +; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v3i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmpl %eax, %ebx +; X86-NEXT: cmovbl %ebx, %eax +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: cmovbl %edi, %edx +; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: cmovbl %esi, %ecx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl + %r = call <3 x i32> @llvm.umin.v3i32(<3 x i32> %a, <3 x i32> %b) + ret <3 x i32> %r +} + +define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { +; SSE-LABEL: test_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: pxor %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v4i32: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmovbl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmovbl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl $4 + %r = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %r +} + +define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { +; SSE-LABEL: test_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: pxor %xmm3, %xmm4 +; SSE-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; X86-LABEL: test_v8i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebp, %eax +; X86-NEXT: cmovbl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebx, %eax +; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmovbl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmovbl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %eax, %edx +; X86-NEXT: cmovbl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %eax, 28(%edx) +; X86-NEXT: movl %ecx, 24(%edx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 16(%edx) +; X86-NEXT: movl %esi, 12(%edx) +; X86-NEXT: movl %edi, 8(%edx) +; X86-NEXT: movl %ebx, 4(%edx) +; X86-NEXT: movl %ebp, (%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %r +} + +define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 
+; SSE-LABEL: test_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pminsw %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v8i16: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bp, %ax +; X86-NEXT: cmovbl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bx, %ax +; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %di, %ax +; X86-NEXT: cmovbl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %si, %ax +; X86-NEXT: cmovbl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movw %ax, 14(%ecx) +; X86-NEXT: movw %dx, 12(%ecx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 10(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 8(%ecx) +; X86-NEXT: movw %si, 6(%ecx) +; X86-NEXT: movw %di, 4(%ecx) +; X86-NEXT: movw %bx, 2(%ecx) +; X86-NEXT: movw %bp, (%ecx) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %r +} + +define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { +; SSE-LABEL: test_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pminub %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v16i8: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $40, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %bl, %al +; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %dl, %al +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %bl, %al +; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %dl, %al +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb %cl, 15(%eax) +; X86-NEXT: movb %dl, 14(%eax) +; X86-NEXT: movb %bl, 13(%eax) +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movb %cl, 12(%eax) +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movb %cl, 11(%eax) +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movb %cl, 10(%eax) +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 9(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 7(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 6(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 5(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 4(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 3(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 2(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 1(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, (%eax) +; X86-NEXT: 
addl $40, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %r +} diff --git a/llvm/test/Transforms/SCCP/conditions-ranges.ll b/llvm/test/Transforms/SCCP/conditions-ranges.ll index 612a38f008fc55..dada59099d81be 100644 --- a/llvm/test/Transforms/SCCP/conditions-ranges.ll +++ b/llvm/test/Transforms/SCCP/conditions-ranges.ll @@ -231,12 +231,12 @@ define void @f7_nested_conds(i32* %a, i32 %b) { ; CHECK-NEXT: [[C_1:%.*]] = icmp ne i32 [[A_V]], 0 ; CHECK-NEXT: br i1 [[C_1]], label [[TRUE:%.*]], label [[FALSE:%.*]] ; CHECK: false: -; CHECK-NEXT: br i1 true, label [[TRUE_2:%.*]], label [[TRUE]] +; CHECK-NEXT: br label [[TRUE_2:%.*]] ; CHECK: true.2: ; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: ret void ; CHECK: true: -; CHECK-NEXT: store i32 [[B:%.*]], i32* [[A]] +; CHECK-NEXT: store i32 [[B:%.*]], i32* [[A]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SCCP/predicateinfo-cond.ll b/llvm/test/Transforms/SCCP/predicateinfo-cond.ll index 8ed96ec9301f5c..1443cc72c2ef8d 100644 --- a/llvm/test/Transforms/SCCP/predicateinfo-cond.ll +++ b/llvm/test/Transforms/SCCP/predicateinfo-cond.ll @@ -105,7 +105,7 @@ define void @pr46814(i32 %a) { ; CHECK-NEXT: [[C3:%.*]] = and i1 [[C1]], [[C2]] ; CHECK-NEXT: br i1 [[C3]], label [[IF_1:%.*]], label [[EXIT:%.*]] ; CHECK: if.1: -; CHECK-NEXT: br i1 true, label [[IF_2:%.*]], label [[EXIT]] +; CHECK-NEXT: br label [[IF_2:%.*]] ; CHECK: if.2: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[EXIT]] ; CHECK: exit: diff --git a/llvm/test/Transforms/SCCP/resolvedundefsin-tracked-fn.ll b/llvm/test/Transforms/SCCP/resolvedundefsin-tracked-fn.ll index e1c7b3d5662d0b..9e9d1256c4cc01 100644 --- a/llvm/test/Transforms/SCCP/resolvedundefsin-tracked-fn.ll +++ b/llvm/test/Transforms/SCCP/resolvedundefsin-tracked-fn.ll @@ -136,13 +136,12 @@ define internal i1 @test2_g(%t1* %h, i32 %i) { ; CHECK-LABEL: define {{[^@]+}}@test2_g ; CHECK-SAME: (%t1* [[H:%.*]], i32 [[I:%.*]]) ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 true, label [[LAND_RHS:%.*]], label [[LAND_END:%.*]] +; CHECK-NEXT: br label [[LAND_RHS:%.*]] ; CHECK: land.rhs: ; CHECK-NEXT: [[CALL:%.*]] = call i32 (...) 
@test2_j() ; CHECK-NEXT: [[TOBOOL1:%.*]] = icmp ne i32 [[CALL]], 0 -; CHECK-NEXT: br label [[LAND_END]] +; CHECK-NEXT: br label [[LAND_END:%.*]] ; CHECK: land.end: -; CHECK-NEXT: [[TMP0:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[TOBOOL1]], [[LAND_RHS]] ] ; CHECK-NEXT: ret i1 undef ; entry: @@ -196,10 +195,9 @@ define internal i32 @test3_k(i8 %h, i32 %i) { ; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[CONV]] to %t1* ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[PHI:%.*]] = phi i1 [ undef, [[ENTRY:%.*]] ], [ false, [[LOOP]] ] ; CHECK-NEXT: [[CALL:%.*]] = call i1 @test3_g(%t1* [[TMP1]], i32 0) ; CHECK-NEXT: call void @use.1(i1 false) -; CHECK-NEXT: br i1 false, label [[LOOP]], label [[EXIT:%.*]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret i32 undef ; diff --git a/llvm/test/Transforms/SCCP/switch-constantfold-crash.ll b/llvm/test/Transforms/SCCP/switch-constantfold-crash.ll index 7596a56b81229c..17b37f000407cb 100644 --- a/llvm/test/Transforms/SCCP/switch-constantfold-crash.ll +++ b/llvm/test/Transforms/SCCP/switch-constantfold-crash.ll @@ -5,11 +5,11 @@ define void @barney() { ; CHECK-LABEL: @barney( ; CHECK-NEXT: bb: -; CHECK-NEXT: br label %bb9 +; CHECK-NEXT: br label [[BB9:%.*]] ; CHECK: bb6: ; CHECK-NEXT: unreachable ; CHECK: bb9: -; CHECK-NEXT: unreachable +; CHECK-NEXT: br label [[BB6:%.*]] ; bb: br label %bb9 @@ -29,9 +29,9 @@ bb9: ; preds = %bb define void @blam() { ; CHECK-LABEL: @blam( ; CHECK-NEXT: bb: -; CHECK-NEXT: br label %bb16 +; CHECK-NEXT: br label [[BB16:%.*]] ; CHECK: bb16: -; CHECK-NEXT: br label %bb38 +; CHECK-NEXT: br label [[BB38:%.*]] ; CHECK: bb38: ; CHECK-NEXT: unreachable ; @@ -62,9 +62,9 @@ bb38: ; preds = %bb16 define void @hoge() { ; CHECK-LABEL: @hoge( ; CHECK-NEXT: bb: -; CHECK-NEXT: br label %bb2 +; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: unreachable +; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb3: ; CHECK-NEXT: unreachable ; diff --git a/llvm/test/Transforms/SCCP/switch.ll b/llvm/test/Transforms/SCCP/switch.ll index d895c5629cd41f..3587587bcb91d7 100644 --- a/llvm/test/Transforms/SCCP/switch.ll +++ b/llvm/test/Transforms/SCCP/switch.ll @@ -23,15 +23,11 @@ define i32 @test_duplicate_successors_phi(i1 %c, i32 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C:%.*]], label [[SWITCH:%.*]], label [[END:%.*]] ; CHECK: switch: -; CHECK-NEXT: switch i32 -1, label [[SWITCH_DEFAULT:%.*]] [ -; CHECK-NEXT: i32 0, label [[END]] -; CHECK-NEXT: i32 1, label [[END]] -; CHECK-NEXT: ] +; CHECK-NEXT: br label [[SWITCH_DEFAULT:%.*]] ; CHECK: switch.default: ; CHECK-NEXT: ret i32 -1 ; CHECK: end: -; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[X:%.*]], [[ENTRY:%.*]] ], [ 1, [[SWITCH]] ], [ 1, [[SWITCH]] ] -; CHECK-NEXT: ret i32 [[PHI]] +; CHECK-NEXT: ret i32 [[X:%.*]] ; entry: br i1 %c, label %switch, label %end @@ -77,6 +73,53 @@ end: ret i32 %phi } +define i32 @test_duplicate_successors_phi_3(i1 %c1, i32 %x) { +; CHECK-LABEL: @test_duplicate_successors_phi_3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C1:%.*]], label [[SWITCH:%.*]], label [[SWITCH_1:%.*]] +; CHECK: switch: +; CHECK-NEXT: [[C2:%.*]] = icmp ult i32 [[X:%.*]], 3 +; CHECK-NEXT: call void @llvm.assume(i1 [[C2]]) +; CHECK-NEXT: switch i32 [[X]], label [[SWITCH_DEFAULT:%.*]] [ +; CHECK-NEXT: i32 0, label [[SWITCH_DEFAULT]] +; CHECK-NEXT: i32 1, label [[SWITCH_0:%.*]] +; CHECK-NEXT: i32 2, label [[SWITCH_0]] +; CHECK-NEXT: i32 3, label [[SWITCH_1]] +; CHECK-NEXT: i32 4, label [[SWITCH_1]] +; CHECK-NEXT: ] +; CHECK: switch.default: +; CHECK-NEXT: ret i32 
-1 +; CHECK: switch.0: +; CHECK-NEXT: ret i32 0 +; CHECK: switch.1: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ 0, [[SWITCH]] ], [ 0, [[SWITCH]] ] +; CHECK-NEXT: ret i32 [[PHI]] +; +entry: + br i1 %c1, label %switch, label %switch.1 + +switch: + %c2 = icmp ult i32 %x, 3 + call void @llvm.assume(i1 %c2) + switch i32 %x, label %switch.default [ + i32 0, label %switch.default + i32 1, label %switch.0 + i32 2, label %switch.0 + i32 3, label %switch.1 + i32 4, label %switch.1 + ] + +switch.default: + ret i32 -1 + +switch.0: + ret i32 0 + +switch.1: + %phi = phi i32 [ %x, %entry ], [ 0, %switch ], [ 0, %switch ] + ret i32 %phi +} + define i32 @test_local_range(i32 %x) { ; CHECK-LABEL: @test_local_range( ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[X:%.*]], 3 diff --git a/llvm/test/Transforms/SCCP/widening.ll b/llvm/test/Transforms/SCCP/widening.ll index 2703bdb27dff4f..23a88c35a93eac 100644 --- a/llvm/test/Transforms/SCCP/widening.ll +++ b/llvm/test/Transforms/SCCP/widening.ll @@ -216,11 +216,11 @@ define void @rotated_loop_2(i32 %x) { ; IPSCCP: bb3: ; IPSCCP-NEXT: br label [[EXIT]] ; IPSCCP: exit: -; IPSCCP-NEXT: [[P:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ 3, [[BB1]] ], [ 2, [[BB2]] ], [ 5, [[BB3]] ], [ [[A:%.*]], [[EXIT]] ] -; IPSCCP-NEXT: [[A]] = add i32 [[P]], 1 +; IPSCCP-NEXT: [[P:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ 3, [[BB1]] ], [ 2, [[BB2]] ], [ 5, [[BB3]] ] +; IPSCCP-NEXT: [[A:%.*]] = add i32 [[P]], 1 ; IPSCCP-NEXT: call void @use(i1 true) ; IPSCCP-NEXT: call void @use(i1 false) -; IPSCCP-NEXT: br i1 false, label [[EXIT]], label [[EXIT_1:%.*]] +; IPSCCP-NEXT: br label [[EXIT_1:%.*]] ; IPSCCP: exit.1: ; IPSCCP-NEXT: ret void ; @@ -451,10 +451,10 @@ define void @foo(i64* %arg) { ; SCCP-NEXT: [[TMP7:%.*]] = sub i64 3, [[TMP6]] ; SCCP-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 1 ; SCCP-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 -; SCCP-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +; SCCP-NEXT: [[TMP0:%.*]] = zext i32 [[TMP9]] to i64 ; SCCP-NEXT: br label [[BB11:%.*]] ; SCCP: bb11: -; SCCP-NEXT: [[TMP12:%.*]] = phi i64 [ [[TMP10]], [[BB4]] ], [ [[TMP17:%.*]], [[BB18:%.*]] ] +; SCCP-NEXT: [[TMP12:%.*]] = phi i64 [ [[TMP0]], [[BB4]] ], [ [[TMP17:%.*]], [[BB18:%.*]] ] ; SCCP-NEXT: br label [[BB13:%.*]] ; SCCP: bb13: ; SCCP-NEXT: [[C_1:%.*]] = icmp eq i64 [[TMP12]], 6 @@ -489,10 +489,10 @@ define void @foo(i64* %arg) { ; IPSCCP-NEXT: [[TMP7:%.*]] = sub i64 3, [[TMP6]] ; IPSCCP-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 1 ; IPSCCP-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 -; IPSCCP-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +; IPSCCP-NEXT: [[TMP0:%.*]] = zext i32 [[TMP9]] to i64 ; IPSCCP-NEXT: br label [[BB11:%.*]] ; IPSCCP: bb11: -; IPSCCP-NEXT: [[TMP12:%.*]] = phi i64 [ [[TMP10]], [[BB4]] ], [ [[TMP17:%.*]], [[BB18:%.*]] ] +; IPSCCP-NEXT: [[TMP12:%.*]] = phi i64 [ [[TMP0]], [[BB4]] ], [ [[TMP17:%.*]], [[BB18:%.*]] ] ; IPSCCP-NEXT: br label [[BB13:%.*]] ; IPSCCP: bb13: ; IPSCCP-NEXT: [[C_1:%.*]] = icmp eq i64 [[TMP12]], 6 diff --git a/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp b/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp index 399c6dd4f7de01..7b2a985bba6616 100644 --- a/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp +++ b/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp @@ -1,4 +1,4 @@ -//===- FunctionPropertiesAnalysisTest.cpp - function properties unit tests-===// +//===- FunctionPropertiesAnalysisTest.cpp - Function Properties Unit Tests-===// // // Part of the LLVM Project, under the Apache License v2.0 with 
LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -8,6 +8,7 @@ #include "llvm/Analysis/FunctionPropertiesAnalysis.h" #include "llvm/AsmParser/Parser.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" @@ -15,42 +16,49 @@ #include "gtest/gtest.h" using namespace llvm; +namespace { -static std::unique_ptr parseIR(LLVMContext &C, const char *IR) { - SMDiagnostic Err; - std::unique_ptr Mod = parseAssemblyString(IR, Err, C); - if (!Mod) - Err.print("MLAnalysisTests", errs()); - return Mod; -} +class FunctionPropertiesAnalysisTest : public testing::Test { +protected: + std::unique_ptr DT; + std::unique_ptr LI; + + FunctionPropertiesInfo buildFPI(Function &F) { + DT.reset(new DominatorTree(F)); + LI.reset(new LoopInfo(*DT)); + return FunctionPropertiesInfo::getFunctionPropertiesInfo(F, *LI); + } -TEST(FunctionPropertiesTest, BasicTest) { + std::unique_ptr makeLLVMModule(LLVMContext &C, const char *IR) { + SMDiagnostic Err; + std::unique_ptr Mod = parseAssemblyString(IR, Err, C); + if (!Mod) + Err.print("MLAnalysisTests", errs()); + return Mod; + } +}; + +TEST_F(FunctionPropertiesAnalysisTest, BasicTest) { LLVMContext C; - std::unique_ptr M = parseIR(C, - R"IR( + std::unique_ptr M = makeLLVMModule(C, + R"IR( target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-linux-gnu" - declare i32 @f1(i32) declare i32 @f2(i32) - define i32 @branches(i32) { %cond = icmp slt i32 %0, 3 br i1 %cond, label %then, label %else - then: %ret.1 = call i32 @f1(i32 %0) br label %last.block - else: %ret.2 = call i32 @f2(i32 %0) br label %last.block - last.block: %ret = phi i32 [%ret.1, %then], [%ret.2, %else] ret i32 %ret } - define internal i32 @top() { %1 = call i32 @branches(i32 2) %2 = call i32 @f1(i32 %1) @@ -58,20 +66,28 @@ define internal i32 @top() { } )IR"); - FunctionAnalysisManager FAM; - FunctionPropertiesAnalysis FA; - - auto BranchesFeatures = FA.run(*M->getFunction("branches"), FAM); + Function *BranchesFunction = M->getFunction("branches"); + FunctionPropertiesInfo BranchesFeatures = buildFPI(*BranchesFunction); EXPECT_EQ(BranchesFeatures.BasicBlockCount, 4); EXPECT_EQ(BranchesFeatures.BlocksReachedFromConditionalInstruction, 2); - EXPECT_EQ(BranchesFeatures.DirectCallsToDefinedFunctions, 0); // 2 Users: top is one. The other is added because @branches is not internal, // so it may have external callers. 
 EXPECT_EQ(BranchesFeatures.Uses, 2);
+  EXPECT_EQ(BranchesFeatures.DirectCallsToDefinedFunctions, 0);
+  EXPECT_EQ(BranchesFeatures.LoadInstCount, 0);
+  EXPECT_EQ(BranchesFeatures.StoreInstCount, 0);
+  EXPECT_EQ(BranchesFeatures.MaxLoopDepth, 0);
+  EXPECT_EQ(BranchesFeatures.TopLevelLoopCount, 0);
 
-  auto TopFeatures = FA.run(*M->getFunction("top"), FAM);
+  Function *TopFunction = M->getFunction("top");
+  FunctionPropertiesInfo TopFeatures = buildFPI(*TopFunction);
   EXPECT_EQ(TopFeatures.BasicBlockCount, 1);
   EXPECT_EQ(TopFeatures.BlocksReachedFromConditionalInstruction, 0);
-  EXPECT_EQ(TopFeatures.DirectCallsToDefinedFunctions, 1);
   EXPECT_EQ(TopFeatures.Uses, 0);
+  EXPECT_EQ(TopFeatures.DirectCallsToDefinedFunctions, 1);
+  EXPECT_EQ(TopFeatures.LoadInstCount, 0);
+  EXPECT_EQ(TopFeatures.StoreInstCount, 0);
+  EXPECT_EQ(TopFeatures.MaxLoopDepth, 0);
+  EXPECT_EQ(TopFeatures.TopLevelLoopCount, 0);
 }
 
+} // end anonymous namespace
\ No newline at end of file
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
index 6ffc181fed67a1..2c327276610aef 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -223,10 +223,10 @@ static bool isTiled(AffineMap map, ArrayRef tileSizes) {
 }
 
 static SmallVector makeTiledViews(OpBuilder &b, Location loc,
-                                  LinalgOp linalgOp,
+                                  LinalgOp linalgOp, AffineMap map,
                                   ArrayRef ivs,
                                   ArrayRef tileSizes,
-                                  ArrayRef viewSizes) {
+                                  ArrayRef allViewSizes) {
   assert(linalgOp.hasBufferSemantics() &&
          "expected linalg op with buffer semantics");
   assert(ivs.size() == static_cast(llvm::count_if(
@@ -236,6 +236,7 @@ static SmallVector makeTiledViews(OpBuilder &b, Location loc,
 
   using namespace edsc::op;
 
+  auto viewSizes = applyMapToValues(b, loc, map, allViewSizes);
   // Construct (potentially temporary) mins and maxes on which to apply maps
   // that define tile subviews.
   SmallVector lbs, subViewSizes;
@@ -356,7 +357,7 @@ Optional static tileLinalgOpImpl(
     return llvm::None;
 
   // 2. Build the tiled loop ranges.
-  auto viewSizes = getViewSizes(b, op);
+  auto allViewSizes = getViewSizes(b, op);
   // The flattened loopToOperandRangesMaps is expected to be an invertible
   // permutation map (asserted in the inverse calculation).
auto mapsRange = op.indexing_maps().getAsRange(); @@ -369,7 +370,7 @@ Optional static tileLinalgOpImpl( SmallVector loopRanges; LoopIndexToRangeIndexMap loopIndexToRangeIndex; std::tie(loopRanges, loopIndexToRangeIndex) = makeTiledLoopRanges( - b, scope.getLocation(), viewSizesToLoopsMap, viewSizes, tileSizes); + b, scope.getLocation(), viewSizesToLoopsMap, allViewSizes, tileSizes); if (!options.interchangeVector.empty()) applyPermutationToVector(loopRanges, options.interchangeVector); @@ -395,7 +396,8 @@ Optional static tileLinalgOpImpl( if (!options.interchangeVector.empty()) ivValues = applyMapToValues(b, loc, invPermutationMap, ivValues); - auto views = makeTiledViews(b, loc, op, ivValues, tileSizes, viewSizes); + auto views = makeTiledViews(b, loc, op, viewSizesToLoopsMap, ivValues, + tileSizes, allViewSizes); auto operands = getAssumedNonViewOperands(op); views.append(operands.begin(), operands.end()); res = op.clone(b, loc, views); diff --git a/mlir/test/Dialect/Linalg/tile_conv.mlir b/mlir/test/Dialect/Linalg/tile_conv.mlir index 1bbb8b60382ef9..a08a2f1e585c64 100644 --- a/mlir/test/Dialect/Linalg/tile_conv.mlir +++ b/mlir/test/Dialect/Linalg/tile_conv.mlir @@ -9,36 +9,38 @@ func @conv(%arg0: memref, %arg1: linalg.conv(%arg0, %arg1, %arg2) {dilations = [10, 20], strides = [30, 40]} : memref, memref, memref return } -// TILE-23004-LABEL: func @conv( -// TILE-23004: %{{.*}}: memref, %{{.*}}: memref, %{{.*}}: memref) { -// TILE-23004-DAG: %[[C0:.*]] = constant 0 : index -// TILE-23004-DAG: %[[C2:.*]] = constant 2 : index -// TILE-23004-DAG: %[[C3:.*]] = constant 3 : index -// TILE-23004-DAG: %[[C4:.*]] = constant 4 : index -// TILE-23004: %[[Q:.*]] = dim %{{.*}}, %c2 : memref -// TILE-23004: %[[B:.*]] = dim %{{.*}}, %c0 : memref -// TILE-23004: %[[PaddedInput0:.*]] = dim %{{.*}}, %c1 : memref -// TILE-23004: %[[X0:.*]] = dim %{{.*}}, %c1 : memref +// TILE-23004: func @conv( +// TILE-23004-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref +// TILE-23004-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref +// TILE-23004-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref) +// TILE-23004-DAG: %[[C0:.*]] = constant 0 : index +// TILE-23004-DAG: %[[C2:.*]] = constant 2 : index +// TILE-23004-DAG: %[[C3:.*]] = constant 3 : index +// TILE-23004-DAG: %[[C4:.*]] = constant 4 : index +// TILE-23004: %[[Z0:.*]] = dim %[[ARG0]], %c0 : memref +// TILE-23004: %[[Q:.*]] = dim %[[ARG0]], %c2 : memref +// TILE-23004: %[[B:.*]] = dim %[[ARG1]], %c0 : memref +// TILE-23004: %[[X0:.*]] = dim %[[ARG2]], %c1 : memref // TILE-23004: scf.for %[[ivI:.*]] = %{{.*}} to %[[B]] step %{{.*}} { // TILE-23004: scf.for %[[ivJ:.*]] = %{{.*}} to %[[X0]] step %{{.*}} { // TILE-23004: scf.for %[[ivK:.*]] = %{{.*}} to %[[Q]] step %{{.*}} { -// TILE-23004: %[[Z0:.*]] = dim %{{.*}}, %c0 : memref -// TILE-23004: %[[Z1:.*]] = dim %{{.*}}, %c1 : memref -// TILE-23004: %[[Z2:.*]] = dim %{{.*}}, %c2 : memref +// TILE-23004: %[[Z0_1:.*]] = dim %[[ARG0]], %c0 : memref +// TILE-23004: %[[Z1:.*]] = dim %[[ARG0]], %c1 : memref +// TILE-23004: %[[Z2:.*]] = dim %[[ARG0]], %c2 : memref // TILE-23004: %[[szK:.*]] = affine.min #[[$bound_map_4]](%[[ivK]])[%[[Z2]]] -// TILE-23004: %[[K:.*]] = dim %{{.*}}, %c3 : memref -// TILE-23004: %[[FilterView:.*]] = subview %{{.*}}[0, 0, %[[ivK]], 0] [%[[Z0]], %[[Z1]], %[[szK]], %[[K]]] [1, 1, 1, 1] : memref to memref +// TILE-23004: %[[K:.*]] = dim %[[ARG0]], %c3 : memref +// TILE-23004: %[[FilterView:.*]] = subview %{{.*}}[0, 0, %[[ivK]], 0] [%[[Z0_1]], %[[Z1]], %[[szK]], %[[K]]] [1, 1, 1, 1] : memref to memref // // TILE-23004: %[[J1:.*]] 
= affine.apply #[[$D0x30pS0x10]](%[[ivJ]]) -// TILE-23004: %[[PaddedInput0b:.*]] = dim %{{.*}}, %c1 : memref -// TILE-23004: %[[I1pStep:.*]] = affine.min #[[$S0x10p90D0x30pS1]](%[[ivJ]])[%[[PaddedInput0]], %[[PaddedInput0b]]] -// TILE-23004: %[[SZ2:.*]] = dim %{{.*}}, %c2 : memref -// TILE-23004: %[[dim3:.*]] = dim %{{.*}}, %c3 +// TILE-23004: %[[PaddedInput0b:.*]] = dim %[[ARG1]], %c1 : memref +// TILE-23004: %[[I1pStep:.*]] = affine.min #[[$S0x10p90D0x30pS1]](%[[ivJ]])[%[[Z0]], %[[PaddedInput0b]]] +// TILE-23004: %[[SZ2:.*]] = dim %[[ARG1]], %c2 : memref +// TILE-23004: %[[dim3:.*]] = dim %[[ARG1]], %c3 // TILE-23004: %[[sz3:.*]] = affine.min #[[$bound_map_4]](%[[ivK]])[%[[dim3]]] // TILE-23004: %[[InputView:.*]] = subview %{{.*}}[%[[ivI]], %[[J1]], 0, %[[ivK]]] [%{{.*}}, %{{.*}}, %[[SZ2]], %[[sz3]]] [1, 1, 1, 1] : memref to memref // -// TILE-23004: %[[X0:.*]] = dim %{{.*}}, %c2 : memref -// TILE-23004: %[[X1:.*]] = dim %{{.*}}, %c3 : memref +// TILE-23004: %[[X0:.*]] = dim %[[ARG2]], %c2 : memref +// TILE-23004: %[[X1:.*]] = dim %[[ARG2]], %c3 : memref // TILE-23004: %[[OutputView:.*]] = subview %{{.*}}[%[[ivI]], %[[ivJ]], 0, 0] [%{{.*}}, %{{.*}}, %[[X0]], %[[X1]]] [1, 1, 1, 1] : memref to memref // // TILE-23004: linalg.conv(%[[FilterView]], %[[InputView]], %[[OutputView]]) {dilations = [10, 20], strides = [30, 40]} : memref, memref, memref diff --git a/mlir/test/Dialect/Linalg/tile_simple_conv.mlir b/mlir/test/Dialect/Linalg/tile_simple_conv.mlir new file mode 100644 index 00000000000000..f854f7570fef3f --- /dev/null +++ b/mlir/test/Dialect/Linalg/tile_simple_conv.mlir @@ -0,0 +1,49 @@ +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,3,4" | FileCheck %s + +// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0)[s0] -> (2, -d0 + s0)> +// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0)[s0, s1] -> (s0 + 3, -d0 + s1)> +// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0)[s0, s1] -> (s0 + 4, -d0 + s1)> +// CHECK-DAG: #[[MAP4:.*]] = affine_map<(d0)[s0] -> (3, -d0 + s0)> +// CHECK-DAG: #[[MAP5:.*]] = affine_map<(d0)[s0] -> (4, -d0 + s0)> + +func @conv(%arg0 : memref, %arg1 : memref, %arg2 : memref) { + linalg.conv(%arg0, %arg1, %arg2) : memref, memref, memref + return +} + +// CHECK: func @conv +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref +// CHECK-DAG: %[[C0:.*]] = constant 0 : index +// CHECK-DAG: %[[C1:.*]] = constant 1 : index +// CHECK-DAG: %[[C2:.*]] = constant 2 : index +// CHECK-DAG: %[[C3:.*]] = constant 3 : index +// CHECK-DAG: %[[C4:.*]] = constant 4 : index +// CHECK: %[[T0:.*]] = dim %[[ARG0]], %[[C0]] +// CHECK: %[[T1:.*]] = dim %[[ARG0]], %[[C1]] +// CHECK: %[[T2:.*]] = dim %[[ARG1]], %[[C0]] +// CHECK: %[[T3:.*]] = dim %[[ARG2]], %[[C1]] +// CHECK: %[[T4:.*]] = dim %[[ARG2]], %[[C2]] +// CHECK: scf.for %[[ARG3:.*]] = %[[C0]] to %[[T2]] step %[[C2]] +// CHECK: scf.for %[[ARG4:.*]] = %[[C0]] to %[[T3]] step %[[C3]] +// CHECK: scf.for %[[ARG5:.*]] = %[[C0]] to %[[T4]] step %[[C4]] +// CHECK: %[[T5:.*]] = dim %[[ARG1]], %[[C0]] +// CHECK: %[[T6:.*]] = affine.min #[[MAP0]](%[[ARG3]])[%[[T5]]] +// CHECK: %[[T7:.*]] = dim %[[ARG1]], %[[C1]] +// CHECK: %[[T8:.*]] = affine.min #[[MAP1]](%[[ARG4]])[%[[T0]], %[[T7]]] +// CHECK: %[[T9:.*]] = dim %[[ARG1]], %[[C2]] +// CHECK: %[[T10:.*]] = affine.min #[[MAP2]](%[[ARG5]])[%[[T1]], %[[T9]]] +// CHECK: %[[T11:.*]] = dim %[[ARG1]], %[[C3]] +// CHECK: %[[SV1:.*]] = subview %[[ARG1]][%[[ARG3]], %[[ARG4]], %[[ARG5]], 0] +// CHECK-SAME: [%[[T6]], %[[T8]], %[[T10]], 
%[[T11]]] +// CHECK: %[[T13:.*]] = dim %[[ARG2]], %[[C0]] +// CHECK: %[[T14:.*]] = affine.min #[[MAP0]](%[[ARG3]])[%[[T13]]] +// CHECK: %[[T15:.*]] = dim %[[ARG2]], %[[C1]] +// CHECK: %[[T16:.*]] = affine.min #[[MAP4]](%[[ARG4]])[%[[T15]]] +// CHECK: %[[T17:.*]] = dim %[[ARG2]], %[[C2]] +// CHECK: %[[T18:.*]] = affine.min #[[MAP5]](%[[ARG5]])[%[[T17]]] +// CHECK: %[[T19:.*]] = dim %[[ARG2]], %[[C3]] +// CHECK: %[[SV2:.*]] = subview %[[ARG2]][%[[ARG3]], %[[ARG4]], %[[ARG5]], 0] +// CHECK-SAME: [%[[T14]], %[[T16]], %[[T18]], %[[T19]]] +// CHECK: linalg.conv(%[[ARG0]], %[[SV1]], %[[SV2]]) \ No newline at end of file
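For reference only, not part of the patch: the FunctionPropertiesAnalysisTest changes above add loop-aware fields (MaxLoopDepth, TopLevelLoopCount) and a buildFPI() helper, but both functions in the test IR are loop-free, so those fields are only ever checked against 0. A minimal sketch of a companion TEST_F in the same fixture, exercising the helper on a function that does contain a loop, might look as follows; the test name, function name, and IR body are invented for illustration and assume the includes and fixture defined in that test file.

// Hypothetical companion test (illustrative sketch, not from the patch):
// runs buildFPI() on a function with one loop so the loop-related
// properties take non-zero values.
TEST_F(FunctionPropertiesAnalysisTest, SingleLoopSketch) {
  LLVMContext C;
  std::unique_ptr<Module> M = makeLLVMModule(C,
                                             R"IR(
define i64 @sum(i64 %n) {
entry:
  br label %loop
loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %acc = phi i64 [ 0, %entry ], [ %acc.next, %loop ]
  %acc.next = add i64 %acc, %i
  %i.next = add i64 %i, 1
  %cond = icmp ult i64 %i.next, %n
  br i1 %cond, label %loop, label %exit
exit:
  ret i64 %acc.next
}
)IR");

  Function *SumFunction = M->getFunction("sum");
  FunctionPropertiesInfo SumFeatures = buildFPI(*SumFunction);
  // entry, loop and exit blocks; the single self-loop gives depth 1 and one
  // top-level loop; the function performs no memory accesses.
  EXPECT_EQ(SumFeatures.BasicBlockCount, 3);
  EXPECT_EQ(SumFeatures.MaxLoopDepth, 1);
  EXPECT_EQ(SumFeatures.TopLevelLoopCount, 1);
  EXPECT_EQ(SumFeatures.LoadInstCount, 0);
  EXPECT_EQ(SumFeatures.StoreInstCount, 0);
}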