diff --git a/compiler-rt/lib/dfsan/dfsan.cpp b/compiler-rt/lib/dfsan/dfsan.cpp index 2d30c2d49419b1..0e2fb9f5f3347d 100644 --- a/compiler-rt/lib/dfsan/dfsan.cpp +++ b/compiler-rt/lib/dfsan/dfsan.cpp @@ -158,12 +158,6 @@ static void dfsan_check_label(dfsan_label label) { } } -static void ReportUnsupportedFast16(const char *func) { - Report("FATAL: DataFlowSanitizer: %s is unsupported in fast16labels mode\n", - func); - Die(); -} - // Resolves the union of two unequal labels. Nonequality is a precondition for // this function (the instrumentation pass inlines the equality test). extern "C" SANITIZER_INTERFACE_ATTRIBUTE @@ -259,10 +253,8 @@ dfsan_union(dfsan_label l1, dfsan_label l2) { extern "C" SANITIZER_INTERFACE_ATTRIBUTE dfsan_label dfsan_create_label(const char *desc, void *userdata) { - if (flags().fast16labels) - ReportUnsupportedFast16("dfsan_create_label"); dfsan_label label = - atomic_fetch_add(&__dfsan_last_label, 1, memory_order_relaxed) + 1; + atomic_fetch_add(&__dfsan_last_label, 1, memory_order_relaxed) + 1; dfsan_check_label(label); __dfsan_label_info[label].l1 = __dfsan_label_info[label].l2 = 0; __dfsan_label_info[label].desc = desc; @@ -319,15 +311,11 @@ dfsan_read_label(const void *addr, uptr size) { extern "C" SANITIZER_INTERFACE_ATTRIBUTE const struct dfsan_label_info *dfsan_get_label_info(dfsan_label label) { - if (flags().fast16labels) - ReportUnsupportedFast16("dfsan_get_label_info"); return &__dfsan_label_info[label]; } extern "C" SANITIZER_INTERFACE_ATTRIBUTE int dfsan_has_label(dfsan_label label, dfsan_label elem) { - if (flags().fast16labels) - return label & elem; if (label == elem) return true; const dfsan_label_info *info = dfsan_get_label_info(label); @@ -340,8 +328,6 @@ dfsan_has_label(dfsan_label label, dfsan_label elem) { extern "C" SANITIZER_INTERFACE_ATTRIBUTE dfsan_label dfsan_has_label_with_desc(dfsan_label label, const char *desc) { - if (flags().fast16labels) - ReportUnsupportedFast16("dfsan_has_label_with_desc"); const dfsan_label_info *info = dfsan_get_label_info(label); if (info->l1 != 0) { return dfsan_has_label_with_desc(info->l1, desc) || @@ -361,11 +347,9 @@ dfsan_get_label_count(void) { extern "C" SANITIZER_INTERFACE_ATTRIBUTE void dfsan_dump_labels(int fd) { - if (flags().fast16labels) - ReportUnsupportedFast16("dfsan_dump_labels"); - dfsan_label last_label = atomic_load(&__dfsan_last_label, memory_order_relaxed); + for (uptr l = 1; l <= last_label; ++l) { char buf[64]; internal_snprintf(buf, sizeof(buf), "%u %u %u ", l, diff --git a/compiler-rt/lib/fuzzer/FuzzerDefs.h b/compiler-rt/lib/fuzzer/FuzzerDefs.h index 1a2752af2f4d56..fb9eccd46324c1 100644 --- a/compiler-rt/lib/fuzzer/FuzzerDefs.h +++ b/compiler-rt/lib/fuzzer/FuzzerDefs.h @@ -20,6 +20,7 @@ #include #include +#include "FuzzerPlatform.h" namespace fuzzer { @@ -62,7 +63,8 @@ typedef Vector Unit; typedef Vector UnitVector; typedef int (*UserCallback)(const uint8_t *Data, size_t Size); -int FuzzerDriver(int *argc, char ***argv, UserCallback Callback); +ATTRIBUTE_INTERFACE int FuzzerDriver(int *argc, char ***argv, + UserCallback Callback); uint8_t *ExtraCountersBegin(); uint8_t *ExtraCountersEnd(); diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index ae085befc4f152..08b12bd5b6e6b4 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -196,6 +196,10 @@ class Allocator { #endif // GWP_ASAN_HOOKS } + ALWAYS_INLINE void initThreadMaybe(bool MinimalInit = false) { + 
TSDRegistry.initThreadMaybe(this, MinimalInit); + } + void reset() { memset(this, 0, sizeof(*this)); } void unmapTestOnly() { @@ -977,10 +981,6 @@ class Allocator { reinterpret_cast(Ptr) - SizeOrUnusedBytes; } - ALWAYS_INLINE void initThreadMaybe(bool MinimalInit = false) { - TSDRegistry.initThreadMaybe(this, MinimalInit); - } - void quarantineOrDeallocateChunk(void *Ptr, Chunk::UnpackedHeader *Header, uptr Size) { Chunk::UnpackedHeader NewHeader = *Header; diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp index 7e04afb90bb1f1..005decdaa82904 100644 --- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp @@ -67,15 +67,17 @@ void checkMemoryTaggingMaybe(AllocatorT *Allocator, void *P, scudo::uptr Size, ""); } +template struct TestAllocator : scudo::Allocator { + TestAllocator() { + this->reset(); + this->initThreadMaybe(); + } + ~TestAllocator() { this->unmapTestOnly(); } +}; + template static void testAllocator() { - using AllocatorT = scudo::Allocator; - auto Deleter = [](AllocatorT *A) { - A->unmapTestOnly(); - delete A; - }; - std::unique_ptr Allocator(new AllocatorT, - Deleter); - Allocator->reset(); + using AllocatorT = TestAllocator; + auto Allocator = std::make_unique(); EXPECT_FALSE(Allocator->isOwned(&Mutex)); EXPECT_FALSE(Allocator->isOwned(&Allocator)); @@ -348,14 +350,8 @@ template static void stressAllocator(AllocatorT *A) { } template static void testAllocatorThreaded() { - using AllocatorT = scudo::Allocator; - auto Deleter = [](AllocatorT *A) { - A->unmapTestOnly(); - delete A; - }; - std::unique_ptr Allocator(new AllocatorT, - Deleter); - Allocator->reset(); + using AllocatorT = TestAllocator; + auto Allocator = std::make_unique(); std::thread Threads[32]; for (scudo::uptr I = 0; I < ARRAY_SIZE(Threads); I++) Threads[I] = std::thread(stressAllocator, Allocator.get()); @@ -401,14 +397,8 @@ struct DeathConfig { }; TEST(ScudoCombinedTest, DeathCombined) { - using AllocatorT = scudo::Allocator; - auto Deleter = [](AllocatorT *A) { - A->unmapTestOnly(); - delete A; - }; - std::unique_ptr Allocator(new AllocatorT, - Deleter); - Allocator->reset(); + using AllocatorT = TestAllocator; + auto Allocator = std::make_unique(); const scudo::uptr Size = 1000U; void *P = Allocator->allocate(Size, Origin); @@ -442,14 +432,8 @@ TEST(ScudoCombinedTest, DeathCombined) { // Ensure that releaseToOS can be called prior to any other allocator // operation without issue. TEST(ScudoCombinedTest, ReleaseToOS) { - using AllocatorT = scudo::Allocator; - auto Deleter = [](AllocatorT *A) { - A->unmapTestOnly(); - delete A; - }; - std::unique_ptr Allocator(new AllocatorT, - Deleter); - Allocator->reset(); + using AllocatorT = TestAllocator; + auto Allocator = std::make_unique(); Allocator->releaseToOS(); } @@ -457,14 +441,8 @@ TEST(ScudoCombinedTest, ReleaseToOS) { // Verify that when a region gets full, the allocator will still manage to // fulfill the allocation through a larger size class. 
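The combined_test.cpp changes above fold the repeated unique_ptr-with-custom-deleter boilerplate into an RAII TestAllocator whose constructor runs reset()/initThreadMaybe() and whose destructor runs unmapTestOnly(). A minimal standalone sketch of that setup/teardown pattern, using a hypothetical Resource type in place of scudo::Allocator<Config>:

#include <cassert>
#include <memory>

// Hypothetical stand-in for scudo::Allocator<Config>; only the lifecycle
// hooks exercised by the tests are modeled here.
struct Resource {
  bool Ready = false;
  void reset() { Ready = false; }
  void initThreadMaybe() { Ready = true; }
  void unmapTestOnly() { Ready = false; }
};

// Mirrors the shape of TestAllocator: setup in the constructor, teardown in
// the destructor, so each test simply constructs the wrapper and forgets it.
struct TestResource : Resource {
  TestResource() {
    this->reset();
    this->initThreadMaybe();
  }
  ~TestResource() { this->unmapTestOnly(); }
};

int main() {
  // Analogous to `auto Allocator = std::make_unique<AllocatorT>();` above.
  auto R = std::make_unique<TestResource>();
  assert(R->Ready);
  return 0; // the destructor performs the unmap step automatically
}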
TEST(ScudoCombinedTest, FullRegion) { - using AllocatorT = scudo::Allocator; - auto Deleter = [](AllocatorT *A) { - A->unmapTestOnly(); - delete A; - }; - std::unique_ptr Allocator(new AllocatorT, - Deleter); - Allocator->reset(); + using AllocatorT = TestAllocator; + auto Allocator = std::make_unique(); std::vector V; scudo::uptr FailedAllocationsCount = 0; diff --git a/compiler-rt/test/dfsan/fast16labels.c b/compiler-rt/test/dfsan/fast16labels.c index 90c039234c336a..a9b71603954045 100644 --- a/compiler-rt/test/dfsan/fast16labels.c +++ b/compiler-rt/test/dfsan/fast16labels.c @@ -1,13 +1,4 @@ -// RUN: %clang_dfsan %s -o %t -// RUN: DFSAN_OPTIONS=fast16labels=1 %run %t -// RUN: DFSAN_OPTIONS=fast16labels=1 not %run %t dfsan_create_label 2>&1 \ -// RUN: | FileCheck %s --check-prefix=CREATE-LABEL -// RUN: DFSAN_OPTIONS=fast16labels=1 not %run %t dfsan_get_label_info 2>&1 \ -// RUN: | FileCheck %s --check-prefix=GET-LABEL-INFO -// RUN: DFSAN_OPTIONS=fast16labels=1 not %run %t dfsan_has_label_with_desc \ -// RUN: 2>&1 | FileCheck %s --check-prefix=HAS-LABEL-WITH-DESC -// RUN: DFSAN_OPTIONS=fast16labels=1:dump_labels_at_exit=/dev/stdout not %run \ -// RUN: %t 2>&1 | FileCheck %s --check-prefix=DUMP-LABELS +// RUN: %clang_dfsan %s -o %t && DFSAN_OPTIONS=fast16labels=1 %run %t // // Tests DFSAN_OPTIONS=fast16labels=1 // @@ -15,45 +6,20 @@ #include #include -#include int foo(int a, int b) { return a + b; } -int main(int argc, char *argv[]) { - // Death tests for unsupported API usage. - const char *command = (argc < 2) ? "" : argv[1]; - fprintf(stderr, "Running with command %s\n", command); - // CREATE-LABEL: FATAL: DataFlowSanitizer: dfsan_create_label is unsupported - if (strcmp(command, "dfsan_create_label") == 0) - dfsan_create_label("", NULL); - // GET-LABEL-INFO: FATAL: DataFlowSanitizer: dfsan_get_label_info is unsupported - if (strcmp(command, "dfsan_get_label_info") == 0) - dfsan_get_label_info(1); - // HAS-LABEL-WITH-DESC: FATAL: DataFlowSanitizer: dfsan_has_label_with_desc is unsupported - if (strcmp(command, "dfsan_has_label_with_desc") == 0) - dfsan_has_label_with_desc(1, ""); - // DUMP-LABELS: FATAL: DataFlowSanitizer: dfsan_dump_labels is unsupported - - // Supported usage. +int main() { int a = 10; int b = 20; dfsan_set_label(8, &a, sizeof(a)); dfsan_set_label(512, &b, sizeof(b)); int c = foo(a, b); - fprintf(stderr, "A: 0x%x\n", dfsan_get_label(a)); - fprintf(stderr, "B: 0x%x\n", dfsan_get_label(b)); + printf("A: 0x%x\n", dfsan_get_label(a)); + printf("B: 0x%x\n", dfsan_get_label(b)); dfsan_label l = dfsan_get_label(c); - fprintf(stderr, "C: 0x%x\n", l); - fprintf(stderr, "Testing l == 520\n"); + printf("C: 0x%x\n", l); assert(l == 520); // OR of the other two labels. 
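With fast16labels=1 a label is a 16-bit bitmask and the union of two labels is their bitwise OR, which is why the combined label of c is expected to be 520 (0x8 | 0x200 == 0x208). A tiny standalone check of that arithmetic, independent of the DFSan runtime:

#include <cassert>
#include <cstdint>

int main() {
  const uint16_t LabelA = 0x8;   // dfsan_set_label(8, &a, sizeof(a))
  const uint16_t LabelB = 0x200; // dfsan_set_label(512, &b, sizeof(b))
  const uint16_t Union = LabelA | LabelB; // fast16labels union is a bitwise OR
  assert(Union == 520);          // 0x208, matching the assert in the test
  return 0;
}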
- fprintf(stderr, "Testing dfsan_has_label(l, 8)\n"); - assert(dfsan_has_label(l, 8)); - fprintf(stderr, "Testing dfsan_has_label(l, 512)\n"); - assert(dfsan_has_label(l, 512)); - fprintf(stderr, "Testing !dfsan_has_label(l, 1)\n"); - assert(!dfsan_has_label(l, 1)); - fprintf(stderr, "returning...\n"); - return 0; } diff --git a/flang/include/flang/Lower/OpenACC.h b/flang/include/flang/Lower/OpenACC.h index 0148e78fed09c2..a3e6f177f91de6 100644 --- a/flang/include/flang/Lower/OpenACC.h +++ b/flang/include/flang/Lower/OpenACC.h @@ -5,6 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// #ifndef FORTRAN_LOWER_OPENACC_H #define FORTRAN_LOWER_OPENACC_H diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 805062eb297f5c..7202d4ec031991 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -5,6 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// +// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/ +// +//===----------------------------------------------------------------------===// #include "flang/Lower/OpenACC.h" #include "flang/Lower/Bridge.h" diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 4a0aa28c9d2b2c..6a1ff3bd64a9ba 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -32,6 +32,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.floor libc.src.math.floorf libc.src.math.floorl + libc.src.math.fmax + libc.src.math.fmaxf + libc.src.math.fmaxl libc.src.math.fmin libc.src.math.fminf libc.src.math.fminl diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td index 7fc199eabc6bf0..1ec1a024f85d09 100644 --- a/libc/config/linux/api.td +++ b/libc/config/linux/api.td @@ -163,6 +163,9 @@ def MathAPI : PublicAPI<"math.h"> { "floor", "floorf", "floorl", + "fmax", + "fmaxf", + "fmaxl", "fmin", "fminf", "fminl", diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 04acfb31da04bd..b20f58c4518471 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -68,6 +68,9 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.math.fmin libc.src.math.fminf libc.src.math.fminl + libc.src.math.fmax + libc.src.math.fmaxf + libc.src.math.fmaxl libc.src.math.frexp libc.src.math.frexpf libc.src.math.frexpl diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index cdeee89be50713..6a11b002d87429 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -209,6 +209,10 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"fminf", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"fminl", RetValSpec, [ArgSpec, ArgSpec]>, + FunctionSpec<"fmax", RetValSpec, [ArgSpec, ArgSpec]>, + FunctionSpec<"fmaxf", RetValSpec, [ArgSpec, ArgSpec]>, + FunctionSpec<"fmaxl", RetValSpec, [ArgSpec, ArgSpec]>, + FunctionSpec<"frexp", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"frexpf", RetValSpec, [ArgSpec, ArgSpec]>, FunctionSpec<"frexpl", RetValSpec, [ArgSpec, ArgSpec]>, diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index d2d694807168ea..da18aeba9a2a50 100644 --- a/libc/src/math/CMakeLists.txt +++ 
b/libc/src/math/CMakeLists.txt @@ -449,3 +449,39 @@ add_entrypoint_object( COMPILE_OPTIONS -O2 ) + +add_entrypoint_object( + fmax + SRCS + fmax.cpp + HDRS + fmax.h + DEPENDS + libc.utils.FPUtil.fputil + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fmaxf + SRCS + fmaxf.cpp + HDRS + fmaxf.h + DEPENDS + libc.utils.FPUtil.fputil + COMPILE_OPTIONS + -O2 +) + +add_entrypoint_object( + fmaxl + SRCS + fmaxl.cpp + HDRS + fmaxl.h + DEPENDS + libc.utils.FPUtil.fputil + COMPILE_OPTIONS + -O2 +) diff --git a/libc/src/math/fmax.cpp b/libc/src/math/fmax.cpp new file mode 100644 index 00000000000000..ba5d93388998eb --- /dev/null +++ b/libc/src/math/fmax.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of fmax function -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/common.h" +#include "utils/FPUtil/BasicOperations.h" + +namespace __llvm_libc { + +double LLVM_LIBC_ENTRYPOINT(fmax)(double x, double y) { + return fputil::fmax(x, y); +} + +} // namespace __llvm_libc diff --git a/libc/src/math/fmax.h b/libc/src/math/fmax.h new file mode 100644 index 00000000000000..9f057983d28bb7 --- /dev/null +++ b/libc/src/math/fmax.h @@ -0,0 +1,18 @@ +//===-- Implementation header for fmax --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FMAX_H +#define LLVM_LIBC_SRC_MATH_FMAX_H + +namespace __llvm_libc { + +double fmax(double x, double y); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_MATH_FMAX_H diff --git a/libc/src/math/fmaxf.cpp b/libc/src/math/fmaxf.cpp new file mode 100644 index 00000000000000..55629040eba665 --- /dev/null +++ b/libc/src/math/fmaxf.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of fmaxf function ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/common.h" +#include "utils/FPUtil/BasicOperations.h" + +namespace __llvm_libc { + +float LLVM_LIBC_ENTRYPOINT(fmaxf)(float x, float y) { + return fputil::fmax(x, y); +} + +} // namespace __llvm_libc diff --git a/libc/src/math/fmaxf.h b/libc/src/math/fmaxf.h new file mode 100644 index 00000000000000..e37df5cf9565d4 --- /dev/null +++ b/libc/src/math/fmaxf.h @@ -0,0 +1,18 @@ +//===-- Implementation header for fmaxf -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FMAXF_H +#define LLVM_LIBC_SRC_MATH_FMAXF_H + +namespace __llvm_libc { + +float fmaxf(float x, float y); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_MATH_FMAXF_H diff --git a/libc/src/math/fmaxl.cpp b/libc/src/math/fmaxl.cpp new file mode 100644 index 00000000000000..c944187f26b7a6 --- /dev/null +++ b/libc/src/math/fmaxl.cpp @@ -0,0 +1,18 @@ +//===-- Implementation of fmaxl function ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/common.h" +#include "utils/FPUtil/BasicOperations.h" + +namespace __llvm_libc { + +long double LLVM_LIBC_ENTRYPOINT(fmaxl)(long double x, long double y) { + return fputil::fmax(x, y); +} + +} // namespace __llvm_libc diff --git a/libc/src/math/fmaxl.h b/libc/src/math/fmaxl.h new file mode 100644 index 00000000000000..41d80ba4aa52ca --- /dev/null +++ b/libc/src/math/fmaxl.h @@ -0,0 +1,18 @@ +//===-- Implementation header for fmaxl -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FMAXL_H +#define LLVM_LIBC_SRC_MATH_FMAXL_H + +namespace __llvm_libc { + +long double fmaxl(long double x, long double y); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_MATH_FMAXL_H diff --git a/libc/src/math/fminf.cpp b/libc/src/math/fminf.cpp index 8f5b54873f8f50..97ad399f4a2bcf 100644 --- a/libc/src/math/fminf.cpp +++ b/libc/src/math/fminf.cpp @@ -15,4 +15,4 @@ float LLVM_LIBC_ENTRYPOINT(fminf)(float x, float y) { return fputil::fmin(x, y); } -} // namespace __llvm_libc \ No newline at end of file +} // namespace __llvm_libc diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index a6898b3706213f..3bd40f6b32f14a 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -472,4 +472,40 @@ add_math_unittest( libc.include.math libc.src.math.fminl libc.utils.FPUtil.fputil -) \ No newline at end of file +) + +add_math_unittest( + fmaxf_test + SUITE + libc_math_unittests + SRCS + fmaxf_test.cpp + DEPENDS + libc.include.math + libc.src.math.fmaxf + libc.utils.FPUtil.fputil +) + +add_math_unittest( + fmax_test + SUITE + libc_math_unittests + SRCS + fmax_test.cpp + DEPENDS + libc.include.math + libc.src.math.fmax + libc.utils.FPUtil.fputil +) + +add_math_unittest( + fmaxl_test + SUITE + libc_math_unittests + SRCS + fmaxl_test.cpp + DEPENDS + libc.include.math + libc.src.math.fmaxl + libc.utils.FPUtil.fputil +) diff --git a/libc/test/src/math/fmax_test.cpp b/libc/test/src/math/fmax_test.cpp new file mode 100644 index 00000000000000..8d1c6c2763821a --- /dev/null +++ b/libc/test/src/math/fmax_test.cpp @@ -0,0 +1,73 @@ +//===-- Unittests for fmax -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#include "include/math.h" +#include "src/math/fmax.h" +#include "utils/FPUtil/FPBits.h" +#include "utils/UnitTest/Test.h" + +using FPBits = __llvm_libc::fputil::FPBits; + +double nan = FPBits::buildNaN(1); +double inf = FPBits::inf(); +double negInf = FPBits::negInf(); + +TEST(FmaxTest, NaNArg) { + EXPECT_EQ(inf, __llvm_libc::fmax(nan, inf)); + EXPECT_EQ(negInf, __llvm_libc::fmax(negInf, nan)); + EXPECT_EQ(0.0, __llvm_libc::fmax(nan, 0.0)); + EXPECT_EQ(-0.0, __llvm_libc::fmax(-0.0, nan)); + EXPECT_EQ(-1.2345, __llvm_libc::fmax(nan, -1.2345)); + EXPECT_EQ(1.2345, __llvm_libc::fmax(1.2345, nan)); + EXPECT_NE(isnan(__llvm_libc::fmax(nan, nan)), 0); +} + +TEST(FmaxTest, InfArg) { + EXPECT_EQ(inf, __llvm_libc::fmax(negInf, inf)); + EXPECT_EQ(inf, __llvm_libc::fmax(inf, 0.0)); + EXPECT_EQ(inf, __llvm_libc::fmax(-0.0, inf)); + EXPECT_EQ(inf, __llvm_libc::fmax(inf, 1.2345)); + EXPECT_EQ(inf, __llvm_libc::fmax(-1.2345, inf)); +} + +TEST(FmaxTest, NegInfArg) { + EXPECT_EQ(inf, __llvm_libc::fmax(inf, negInf)); + EXPECT_EQ(0.0, __llvm_libc::fmax(negInf, 0.0)); + EXPECT_EQ(-0.0, __llvm_libc::fmax(-0.0, negInf)); + EXPECT_EQ(-1.2345, __llvm_libc::fmax(negInf, -1.2345)); + EXPECT_EQ(1.2345, __llvm_libc::fmax(1.2345, negInf)); +} + +TEST(FmaxTest, BothZero) { + EXPECT_EQ(0.0, __llvm_libc::fmax(0.0, 0.0)); + EXPECT_EQ(0.0, __llvm_libc::fmax(-0.0, 0.0)); + EXPECT_EQ(0.0, __llvm_libc::fmax(0.0, -0.0)); + EXPECT_EQ(-0.0, __llvm_libc::fmax(-0.0, -0.0)); +} + +TEST(FmaxTest, InDoubleRange) { + using UIntType = FPBits::UIntType; + constexpr UIntType count = 10000001; + constexpr UIntType step = UIntType(-1) / count; + for (UIntType i = 0, v = 0, w = UIntType(-1); i <= count; + ++i, v += step, w -= step) { + double x = FPBits(v), y = FPBits(w); + if (isnan(x) || isinf(x)) + continue; + if (isnan(y) || isinf(y)) + continue; + if ((x == 0) && (y == 0)) + continue; + + if (x > y) { + ASSERT_EQ(x, __llvm_libc::fmax(x, y)); + } else { + ASSERT_EQ(y, __llvm_libc::fmax(x, y)); + } + } +} diff --git a/libc/test/src/math/fmaxf_test.cpp b/libc/test/src/math/fmaxf_test.cpp new file mode 100644 index 00000000000000..fe9eaad171bd28 --- /dev/null +++ b/libc/test/src/math/fmaxf_test.cpp @@ -0,0 +1,73 @@ +//===-- Unittests for fmaxf -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#include "include/math.h" +#include "src/math/fmaxf.h" +#include "utils/FPUtil/FPBits.h" +#include "utils/UnitTest/Test.h" + +using FPBits = __llvm_libc::fputil::FPBits; + +float nan = FPBits::buildNaN(1); +float inf = FPBits::inf(); +float negInf = FPBits::negInf(); + +TEST(FmaxfTest, NaNArg) { + EXPECT_EQ(inf, __llvm_libc::fmaxf(nan, inf)); + EXPECT_EQ(negInf, __llvm_libc::fmaxf(negInf, nan)); + EXPECT_EQ(0.0f, __llvm_libc::fmaxf(nan, 0.0f)); + EXPECT_EQ(-0.0f, __llvm_libc::fmaxf(-0.0f, nan)); + EXPECT_EQ(-1.2345f, __llvm_libc::fmaxf(nan, -1.2345f)); + EXPECT_EQ(1.2345f, __llvm_libc::fmaxf(1.2345f, nan)); + EXPECT_NE(isnan(__llvm_libc::fmaxf(nan, nan)), 0); +} + +TEST(FmaxfTest, InfArg) { + EXPECT_EQ(inf, __llvm_libc::fmaxf(negInf, inf)); + EXPECT_EQ(inf, __llvm_libc::fmaxf(inf, 0.0f)); + EXPECT_EQ(inf, __llvm_libc::fmaxf(-0.0f, inf)); + EXPECT_EQ(inf, __llvm_libc::fmaxf(inf, 1.2345f)); + EXPECT_EQ(inf, __llvm_libc::fmaxf(-1.2345f, inf)); +} + +TEST(FmaxfTest, NegInfArg) { + EXPECT_EQ(inf, __llvm_libc::fmaxf(inf, negInf)); + EXPECT_EQ(0.0f, __llvm_libc::fmaxf(negInf, 0.0f)); + EXPECT_EQ(-0.0f, __llvm_libc::fmaxf(-0.0f, negInf)); + EXPECT_EQ(-1.2345f, __llvm_libc::fmaxf(negInf, -1.2345f)); + EXPECT_EQ(1.2345f, __llvm_libc::fmaxf(1.2345f, negInf)); +} + +TEST(FmaxfTest, BothZero) { + EXPECT_EQ(0.0f, __llvm_libc::fmaxf(0.0f, 0.0f)); + EXPECT_EQ(0.0f, __llvm_libc::fmaxf(-0.0f, 0.0f)); + EXPECT_EQ(0.0f, __llvm_libc::fmaxf(0.0f, -0.0f)); + EXPECT_EQ(-0.0f, __llvm_libc::fmaxf(-0.0f, -0.0f)); +} + +TEST(FmaxfTest, InFloatRange) { + using UIntType = FPBits::UIntType; + constexpr UIntType count = 10000001; + constexpr UIntType step = UIntType(-1) / count; + for (UIntType i = 0, v = 0, w = UIntType(-1); i <= count; + ++i, v += step, w -= step) { + float x = FPBits(v), y = FPBits(w); + if (isnan(x) || isinf(x)) + continue; + if (isnan(y) || isinf(y)) + continue; + if ((x == 0) && (y == 0)) + continue; + + if (x > y) { + ASSERT_EQ(x, __llvm_libc::fmaxf(x, y)); + } else { + ASSERT_EQ(y, __llvm_libc::fmaxf(x, y)); + } + } +} diff --git a/libc/test/src/math/fmaxl_test.cpp b/libc/test/src/math/fmaxl_test.cpp new file mode 100644 index 00000000000000..9c7aa21582ebc7 --- /dev/null +++ b/libc/test/src/math/fmaxl_test.cpp @@ -0,0 +1,73 @@ +//===-- Unittests for fmaxl -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#include "include/math.h" +#include "src/math/fmaxl.h" +#include "utils/FPUtil/FPBits.h" +#include "utils/UnitTest/Test.h" + +using FPBits = __llvm_libc::fputil::FPBits; + +long double nan = FPBits::buildNaN(1); +long double inf = FPBits::inf(); +long double negInf = FPBits::negInf(); + +TEST(FmaxlTest, NaNArg) { + EXPECT_EQ(inf, __llvm_libc::fmaxl(nan, inf)); + EXPECT_EQ(negInf, __llvm_libc::fmaxl(negInf, nan)); + EXPECT_EQ(0.0L, __llvm_libc::fmaxl(nan, 0.0L)); + EXPECT_EQ(-0.0L, __llvm_libc::fmaxl(-0.0L, nan)); + EXPECT_EQ(-1.2345L, __llvm_libc::fmaxl(nan, -1.2345L)); + EXPECT_EQ(1.2345L, __llvm_libc::fmaxl(1.2345L, nan)); + EXPECT_NE(isnan(__llvm_libc::fmaxl(nan, nan)), 0); +} + +TEST(FmaxlTest, InfArg) { + EXPECT_EQ(inf, __llvm_libc::fmaxl(negInf, inf)); + EXPECT_EQ(inf, __llvm_libc::fmaxl(inf, 0.0L)); + EXPECT_EQ(inf, __llvm_libc::fmaxl(-0.0L, inf)); + EXPECT_EQ(inf, __llvm_libc::fmaxl(inf, 1.2345L)); + EXPECT_EQ(inf, __llvm_libc::fmaxl(-1.2345L, inf)); +} + +TEST(FmaxlTest, NegInfArg) { + EXPECT_EQ(inf, __llvm_libc::fmaxl(inf, negInf)); + EXPECT_EQ(0.0L, __llvm_libc::fmaxl(negInf, 0.0L)); + EXPECT_EQ(-0.0L, __llvm_libc::fmaxl(-0.0L, negInf)); + EXPECT_EQ(-1.2345L, __llvm_libc::fmaxl(negInf, -1.2345L)); + EXPECT_EQ(1.2345L, __llvm_libc::fmaxl(1.2345L, negInf)); +} + +TEST(FmaxlTest, BothZero) { + EXPECT_EQ(0.0L, __llvm_libc::fmaxl(0.0L, 0.0L)); + EXPECT_EQ(0.0L, __llvm_libc::fmaxl(-0.0L, 0.0L)); + EXPECT_EQ(0.0L, __llvm_libc::fmaxl(0.0L, -0.0L)); + EXPECT_EQ(-0.0L, __llvm_libc::fmaxl(-0.0L, -0.0L)); +} + +TEST(FmaxlTest, InLongDoubleRange) { + using UIntType = FPBits::UIntType; + constexpr UIntType count = 10000001; + constexpr UIntType step = UIntType(-1) / count; + for (UIntType i = 0, v = 0, w = UIntType(-1); i <= count; + ++i, v += step, w -= step) { + long double x = FPBits(v), y = FPBits(w); + if (isnan(x) || isinf(x)) + continue; + if (isnan(y) || isinf(y)) + continue; + if ((x == 0) && (y == 0)) + continue; + + if (x > y) { + ASSERT_EQ(x, __llvm_libc::fmaxl(x, y)); + } else { + ASSERT_EQ(y, __llvm_libc::fmaxl(x, y)); + } + } +} diff --git a/libc/test/src/math/fmin_test.cpp b/libc/test/src/math/fmin_test.cpp index c1467e4cbb9007..7fd1e8af1e510e 100644 --- a/libc/test/src/math/fmin_test.cpp +++ b/libc/test/src/math/fmin_test.cpp @@ -6,8 +6,6 @@ // //===---------------------------------------------------------------------===// -#include - #include "include/math.h" #include "src/math/fmin.h" #include "utils/FPUtil/FPBits.h" diff --git a/libc/test/src/math/fminf_test.cpp b/libc/test/src/math/fminf_test.cpp index 8d03d92f723701..dea4e7ccf52462 100644 --- a/libc/test/src/math/fminf_test.cpp +++ b/libc/test/src/math/fminf_test.cpp @@ -6,8 +6,6 @@ // //===---------------------------------------------------------------------===// -#include - #include "include/math.h" #include "src/math/fminf.h" #include "utils/FPUtil/FPBits.h" diff --git a/libc/test/src/math/fminl_test.cpp b/libc/test/src/math/fminl_test.cpp index 519c1f33fba6e6..3eabb55b0cab23 100644 --- a/libc/test/src/math/fminl_test.cpp +++ b/libc/test/src/math/fminl_test.cpp @@ -6,8 +6,6 @@ // //===---------------------------------------------------------------------===// -#include - #include "include/math.h" #include "src/math/fminl.h" #include "utils/FPUtil/FPBits.h" diff --git a/libc/utils/FPUtil/BasicOperations.h b/libc/utils/FPUtil/BasicOperations.h index 
2f86ddf5678cfd..78856926af4bfb 100644 --- a/libc/utils/FPUtil/BasicOperations.h +++ b/libc/utils/FPUtil/BasicOperations.h @@ -43,6 +43,25 @@ static inline T fmin(T x, T y) { } } +template ::Value, int> = 0> +static inline T fmax(T x, T y) { + FPBits bitx(x), bity(y); + + if (bitx.isNaN()) { + return y; + } else if (bity.isNaN()) { + return x; + } else if (bitx.sign != bity.sign) { + // To make sure that fmax(+0, -0) == +0 == fmax(-0, +0), whenever x and + // y has different signs and both are not NaNs, we return the number + // with positive sign. + return (bitx.sign ? y : x); + } else { + return (x > y ? x : y); + } +} + } // namespace fputil } // namespace __llvm_libc diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index 5fa3b66c99a7f7..fd877f1adaeb49 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -526,6 +526,11 @@ function(cxx_add_basic_build_flags target) CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO) + # When building the dylib, don't warn for unavailable aligned allocation + # functions based on the deployment target -- they are always available + # because they are provided by the dylib itself. + target_add_compile_flags_if_supported(${target} PRIVATE -faligned-allocation) + # On all systems the system c++ standard library headers need to be excluded. # MSVC only has -X, which disables all default includes; including the crt. # Thus, we do nothing and hope we don't accidentally include any of the C++ diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 271364c5c6ec39..6b9c5c68998198 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -12258,14 +12258,172 @@ Example: call void @llvm.call.preallocated.teardown(token %cs) ret void -Standard C Library Intrinsics ------------------------------ +Standard C/C++ Library Intrinsics +--------------------------------- -LLVM provides intrinsics for a few important standard C library +LLVM provides intrinsics for a few important standard C/C++ library functions. These intrinsics allow source-language front-ends to pass information about the alignment of the pointer arguments to the code generator, providing opportunity for more efficient code generation. + +'``llvm.abs.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +This is an overloaded intrinsic. You can use ``llvm.abs`` on any +integer bit width or any vector of integer elements. + +:: + + declare i32 @llvm.abs.i32(i32 , i1 ) + declare <4 x i32> @llvm.abs.v4i32(<4 x i32> , i1 ) + +Overview: +""""""""" + +The '``llvm.abs``' family of intrinsic functions returns the absolute value +of an argument. + +Arguments: +"""""""""" + +The first argument is the value for which the absolute value is to be returned. +This argument may be of any integer type or a vector with integer element type. +The return type must match the first argument type. + +The second argument must be a constant and is a flag to indicate whether the +result value of the '``llvm.abs``' intrinsic is a +:ref:`poison value ` if the argument is statically or dynamically +an ``INT_MIN`` value. + +Semantics: +"""""""""" + +The '``llvm.abs``' intrinsic returns the magnitude (always positive) of the +argument or each element of a vector argument.". If the argument is ``INT_MIN``, +then the result is also ``INT_MIN`` if ``is_int_min_poison == 0`` and +``poison`` otherwise. + + +'``llvm.smax.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +This is an overloaded intrinsic. 
You can use ``@llvm.smax`` on any +integer bit width or any vector of integer elements. + +:: + + declare i32 @llvm.smax.i32(i32 %a, i32 %b) + declare <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b) + +Overview: +""""""""" + +Return the larger of ``%a`` and ``%b`` comparing the values as signed integers. +Vector intrinsics operate on a per-element basis. The larger element of ``%a`` +and ``%b`` at a given index is returned for that index. + +Arguments: +"""""""""" + +The arguments (``%a`` and ``%b``) may be of any integer type or a vector with +integer element type. The argument types must match each other, and the return +type must match the argument type. + + +'``llvm.smin.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +This is an overloaded intrinsic. You can use ``@llvm.smin`` on any +integer bit width or any vector of integer elements. + +:: + + declare i32 @llvm.smin.i32(i32 %a, i32 %b) + declare <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> %b) + +Overview: +""""""""" + +Return the smaller of ``%a`` and ``%b`` comparing the values as signed integers. +Vector intrinsics operate on a per-element basis. The smaller element of ``%a`` +and ``%b`` at a given index is returned for that index. + +Arguments: +"""""""""" + +The arguments (``%a`` and ``%b``) may be of any integer type or a vector with +integer element type. The argument types must match each other, and the return +type must match the argument type. + + +'``llvm.umax.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +This is an overloaded intrinsic. You can use ``@llvm.umax`` on any +integer bit width or any vector of integer elements. + +:: + + declare i32 @llvm.umax.i32(i32 %a, i32 %b) + declare <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> %b) + +Overview: +""""""""" + +Return the larger of ``%a`` and ``%b`` comparing the values as unsigned +integers. Vector intrinsics operate on a per-element basis. The larger element +of ``%a`` and ``%b`` at a given index is returned for that index. + +Arguments: +"""""""""" + +The arguments (``%a`` and ``%b``) may be of any integer type or a vector with +integer element type. The argument types must match each other, and the return +type must match the argument type. + + +'``llvm.umin.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +This is an overloaded intrinsic. You can use ``@llvm.umin`` on any +integer bit width or any vector of integer elements. + +:: + + declare i32 @llvm.umin.i32(i32 %a, i32 %b) + declare <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> %b) + +Overview: +""""""""" + +Return the smaller of ``%a`` and ``%b`` comparing the values as unsigned +integers. Vector intrinsics operate on a per-element basis. The smaller element +of ``%a`` and ``%b`` at a given index is returned for that index. + +Arguments: +"""""""""" + +The arguments (``%a`` and ``%b``) may be of any integer type or a vector with +integer element type. The argument types must match each other, and the return +type must match the argument type. + + .. 
_int_memcpy: '``llvm.memcpy``' Intrinsic diff --git a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h index 0163e69ac9dd46..a39c4e5413d8d2 100644 --- a/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h +++ b/llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h @@ -1,44 +1,85 @@ -//=- FunctionPropertiesAnalysis.h - Function Properties extraction -*- C++ -=// +//=- FunctionPropertiesAnalysis.h - Function Properties Analysis --*- C++ -*-=// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// +// This file defines the FunctionPropertiesInfo and FunctionPropertiesAnalysis +// classes used to extract function properties. +// +//===----------------------------------------------------------------------===// #ifndef LLVM_FUNCTIONPROPERTIESANALYSIS_H_ #define LLVM_FUNCTIONPROPERTIESANALYSIS_H_ +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/PassManager.h" namespace llvm { class Function; +class FunctionPropertiesInfo { +public: + static FunctionPropertiesInfo getFunctionPropertiesInfo(const Function &F, + const LoopInfo &LI); + + void print(raw_ostream &OS) const; + + /// Number of basic blocks + int64_t BasicBlockCount = 0; + + /// Number of blocks reached from a conditional instruction, or that are + /// 'cases' of a SwitchInstr. + // FIXME: We may want to replace this with a more meaningful metric, like + // number of conditionally executed blocks: + // 'if (a) s();' would be counted here as 2 blocks, just like + // 'if (a) s(); else s2(); s3();' would. + int64_t BlocksReachedFromConditionalInstruction = 0; + + /// Number of uses of this function, plus 1 if the function is callable + /// outside the module. + int64_t Uses = 0; + + /// Number of direct calls made from this function to other functions + /// defined in this module. + int64_t DirectCallsToDefinedFunctions = 0; + + // Load Instruction Count + int64_t LoadInstCount = 0; + + // Store Instruction Count + int64_t StoreInstCount = 0; + + // Maximum Loop Depth in the Function + int64_t MaxLoopDepth = 0; + + // Number of Top Level Loops in the Function + int64_t TopLevelLoopCount = 0; +}; + +// Analysis pass class FunctionPropertiesAnalysis : public AnalysisInfoMixin { + public: static AnalysisKey Key; - struct Result { - /// Number of basic blocks - int64_t BasicBlockCount = 0; - - /// Number of blocks reached from a conditional instruction, or that are - /// 'cases' of a SwitchInstr. - // FIXME: We may want to replace this with a more meaningful metric, like - // number of conditionally executed blocks: - // 'if (a) s();' would be counted here as 2 blocks, just like - // 'if (a) s(); else s2(); s3();' would. - int64_t BlocksReachedFromConditionalInstruction = 0; - - /// Number of uses of this function, plus 1 if the function is callable - /// outside the module. - int64_t Uses = 0; - - /// Number of direct calls made from this function to other functions - /// defined in this module. - int64_t DirectCallsToDefinedFunctions = 0; - }; - Result run(const Function &F, FunctionAnalysisManager &FAM); + + using Result = FunctionPropertiesInfo; + + Result run(Function &F, FunctionAnalysisManager &FAM); +}; + +/// Printer pass for the FunctionPropertiesAnalysis results. 
+class FunctionPropertiesPrinterPass + : public PassInfoMixin { + raw_ostream &OS; + +public: + explicit FunctionPropertiesPrinterPass(raw_ostream &OS) : OS(OS) {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 43a8cb2a1d51c8..e694e7ad2c8348 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -107,6 +107,9 @@ class CombinerHelper { bool matchCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo); void applyCombineIndexedLoadStore(MachineInstr &MI, IndexedLoadStoreMatchInfo &MatchInfo); + bool matchSextTruncSextLoad(MachineInstr &MI); + bool applySextTruncSextLoad(MachineInstr &MI); + bool matchElideBrByInvertingCond(MachineInstr &MI); void applyElideBrByInvertingCond(MachineInstr &MI); bool tryElideBrByInvertingCond(MachineInstr &MI); diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 4918ea876df65e..0a071464804ebe 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1039,6 +1039,25 @@ def int_udiv_fix_sat : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem, ImmArg>]>; +//===------------------ Integer Min/Max/Abs Intrinsics --------------------===// +// +def int_abs : Intrinsic< + [llvm_anyint_ty], [LLVMMatchType<0>, llvm_i1_ty], + [IntrNoMem, IntrSpeculatable, IntrWillReturn, ImmArg>]>; + +def int_smax : Intrinsic< + [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; +def int_smin : Intrinsic< + [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; +def int_umax : Intrinsic< + [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; +def int_umin : Intrinsic< + [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; + //===------------------------- Memory Use Markers -------------------------===// // def int_lifetime_start : Intrinsic<[], diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index eeb2761faeb9f1..40ed6be089ac8d 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -125,6 +125,12 @@ def extending_loads : GICombineRule< (apply [{ Helper.applyCombineExtendingLoads(*${root}, ${matchinfo}); }])>; def combines_for_extload: GICombineGroup<[extending_loads]>; +def sext_trunc_sextload : GICombineRule< + (defs root:$d), + (match (wip_match_opcode G_SEXT_INREG):$d, + [{ return Helper.matchSextTruncSextLoad(*${d}); }]), + (apply [{ Helper.applySextTruncSextLoad(*${d}); }])>; + def combine_indexed_load_store : GICombineRule< (defs root:$root, indexed_load_store_matchdata:$matchinfo), (match (wip_match_opcode G_LOAD, G_SEXTLOAD, G_ZEXTLOAD, G_STORE):$root, diff --git a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp index a0fc017568f2d6..3db108c94985a4 100644 --- a/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp +++ b/llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp @@ -1,4 +1,4 @@ -//===- FunctionPropertiesAnalysis.cpp - Function properties extraction ----===// +//===- FunctionPropertiesAnalysis.cpp - Function Properties 
Analysis ------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// // -// This file implements an analysis extracting function features, which may be -// used by ML-driven policies, for example. +// This file defines the FunctionPropertiesInfo and FunctionPropertiesAnalysis +// classes used to extract function properties. // //===----------------------------------------------------------------------===// @@ -16,27 +16,75 @@ using namespace llvm; -AnalysisKey FunctionPropertiesAnalysis::Key; +FunctionPropertiesInfo +FunctionPropertiesInfo::getFunctionPropertiesInfo(const Function &F, + const LoopInfo &LI) { + + FunctionPropertiesInfo FPI; + + FPI.Uses = ((!F.hasLocalLinkage()) ? 1 : 0) + F.getNumUses(); -FunctionPropertiesAnalysis::Result -FunctionPropertiesAnalysis::run(const Function &F, - FunctionAnalysisManager &FAM) { - Result Ret; - Ret.Uses = ((!F.hasLocalLinkage()) ? 1 : 0) + F.getNumUses(); for (const auto &BB : F) { - ++Ret.BasicBlockCount; + ++FPI.BasicBlockCount; + if (const auto *BI = dyn_cast(BB.getTerminator())) { if (BI->isConditional()) - Ret.BlocksReachedFromConditionalInstruction += BI->getNumSuccessors(); - } else if (const auto *SI = dyn_cast(BB.getTerminator())) - Ret.BlocksReachedFromConditionalInstruction += + FPI.BlocksReachedFromConditionalInstruction += BI->getNumSuccessors(); + } else if (const auto *SI = dyn_cast(BB.getTerminator())) { + FPI.BlocksReachedFromConditionalInstruction += (SI->getNumCases() + (nullptr != SI->getDefaultDest())); - for (const auto &I : BB) + } + + for (const auto &I : BB) { if (auto *CS = dyn_cast(&I)) { const auto *Callee = CS->getCalledFunction(); if (Callee && !Callee->isIntrinsic() && !Callee->isDeclaration()) - ++Ret.DirectCallsToDefinedFunctions; + ++FPI.DirectCallsToDefinedFunctions; + } + if (I.getOpcode() == Instruction::Load) { + ++FPI.LoadInstCount; + } else if (I.getOpcode() == Instruction::Store) { + ++FPI.StoreInstCount; } + } + // Loop Depth of the Basic Block + int64_t LoopDepth; + LoopDepth = LI.getLoopDepth(&BB); + if (FPI.MaxLoopDepth < LoopDepth) + FPI.MaxLoopDepth = LoopDepth; } - return Ret; + for (Loop *L : LI) { + ++FPI.TopLevelLoopCount; + } + return FPI; +} + +void FunctionPropertiesInfo::print(raw_ostream &OS) const { + OS << "BasicBlockCount: " << BasicBlockCount << "\n" + << "BlocksReachedFromConditionalInstruction: " + << BlocksReachedFromConditionalInstruction << "\n" + << "Uses: " << Uses << "\n" + << "DirectCallsToDefinedFunctions: " << DirectCallsToDefinedFunctions + << "\n" + << "LoadInstCount: " << LoadInstCount << "\n" + << "StoreInstCount: " << StoreInstCount << "\n" + << "MaxLoopDepth: " << MaxLoopDepth << "\n" + << "TopLevelLoopCount: " << TopLevelLoopCount << "\n\n"; +} + +AnalysisKey FunctionPropertiesAnalysis::Key; + +FunctionPropertiesInfo +FunctionPropertiesAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { + return FunctionPropertiesInfo::getFunctionPropertiesInfo( + F, FAM.getResult(F)); +} + +PreservedAnalyses +FunctionPropertiesPrinterPass::run(Function &F, FunctionAnalysisManager &AM) { + OS << "Printing analysis results of CFA for function " + << "'" << F.getName() << "':" + << "\n"; + AM.getResult(F).print(OS); + return PreservedAnalyses::all(); } \ No newline at end of file diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp 
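For reference, the new FunctionPropertiesInfo can also be queried directly from C++ through the new pass manager. A rough sketch, assuming the analysis is reachable via PassBuilder::registerFunctionAnalyses once PassRegistry.def lists it (as this patch does):

#include "llvm/Analysis/FunctionPropertiesAnalysis.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>

using namespace llvm;

int main(int argc, char **argv) {
  if (argc < 2)
    return 1;
  LLVMContext Ctx;
  SMDiagnostic Err;
  std::unique_ptr<Module> M = parseIRFile(argv[1], Err, Ctx);
  if (!M)
    return 1;

  // Register the function-level analyses listed in PassRegistry.def, which
  // now includes FunctionPropertiesAnalysis and its LoopAnalysis dependency.
  PassBuilder PB;
  FunctionAnalysisManager FAM;
  PB.registerFunctionAnalyses(FAM);

  for (Function &F : *M) {
    if (F.isDeclaration())
      continue;
    // Same result that the print<func-properties> pass writes out.
    FAM.getResult<FunctionPropertiesAnalysis>(F).print(outs());
  }
  return 0;
}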
index 32bad28d318ba9..eba1a522d4132e 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -576,6 +576,40 @@ bool CombinerHelper::dominates(const MachineInstr &DefMI, return isPredecessor(DefMI, UseMI); } +bool CombinerHelper::matchSextTruncSextLoad(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_SEXT_INREG); + Register SrcReg = MI.getOperand(1).getReg(); + Register LoadUser = SrcReg; + + if (MRI.getType(SrcReg).isVector()) + return false; + + Register TruncSrc; + if (mi_match(SrcReg, MRI, m_GTrunc(m_Reg(TruncSrc)))) + LoadUser = TruncSrc; + + uint64_t SizeInBits = MI.getOperand(2).getImm(); + // If the source is a G_SEXTLOAD from the same bit width, then we don't + // need any extend at all, just a truncate. + if (auto *LoadMI = getOpcodeDef(TargetOpcode::G_SEXTLOAD, LoadUser, MRI)) { + const auto &MMO = **LoadMI->memoperands_begin(); + // If truncating more than the original extended value, abort. + if (TruncSrc && MRI.getType(TruncSrc).getSizeInBits() < MMO.getSizeInBits()) + return false; + if (MMO.getSizeInBits() == SizeInBits) + return true; + } + return false; +} + +bool CombinerHelper::applySextTruncSextLoad(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_SEXT_INREG); + MachineIRBuilder MIB(MI); + MIB.buildCopy(MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); + MI.eraseFromParent(); + return true; +} + bool CombinerHelper::findPostIndexCandidate(MachineInstr &MI, Register &Addr, Register &Base, Register &Offset) { auto &MF = *MI.getParent()->getParent(); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index a5706958b39fa2..aa898d5a618963 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1486,6 +1486,14 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, return translateBinaryOp(TargetOpcode::G_USUBSAT, CI, MIRBuilder); case Intrinsic::ssub_sat: return translateBinaryOp(TargetOpcode::G_SSUBSAT, CI, MIRBuilder); + case Intrinsic::umin: + return translateBinaryOp(TargetOpcode::G_UMIN, CI, MIRBuilder); + case Intrinsic::umax: + return translateBinaryOp(TargetOpcode::G_UMAX, CI, MIRBuilder); + case Intrinsic::smin: + return translateBinaryOp(TargetOpcode::G_SMIN, CI, MIRBuilder); + case Intrinsic::smax: + return translateBinaryOp(TargetOpcode::G_SMAX, CI, MIRBuilder); case Intrinsic::fmuladd: { const TargetMachine &TM = MF->getTarget(); const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 8dadb2833e8d9a..9d2f64b94df166 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6363,6 +6363,36 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Op1, Op2, Op3, DAG, TLI)); return; } + case Intrinsic::smax: { + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + setValue(&I, DAG.getNode(ISD::SMAX, sdl, Op1.getValueType(), Op1, Op2)); + return; + } + case Intrinsic::smin: { + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + setValue(&I, DAG.getNode(ISD::SMIN, sdl, Op1.getValueType(), Op1, Op2)); + return; + } + case Intrinsic::umax: { + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + 
setValue(&I, DAG.getNode(ISD::UMAX, sdl, Op1.getValueType(), Op1, Op2)); + return; + } + case Intrinsic::umin: { + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2 = getValue(I.getArgOperand(1)); + setValue(&I, DAG.getNode(ISD::UMIN, sdl, Op1.getValueType(), Op1, Op2)); + return; + } + case Intrinsic::abs: { + // TODO: Preserve "int min is poison" arg in SDAG? + SDValue Op1 = getValue(I.getArgOperand(0)); + setValue(&I, DAG.getNode(ISD::ABS, sdl, Op1.getValueType(), Op1)); + return; + } case Intrinsic::stacksave: { SDValue Op = getRoot(); EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index f323d37ca46adf..edaca9ebf60901 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -129,10 +129,10 @@ FUNCTION_ANALYSIS("domtree", DominatorTreeAnalysis()) FUNCTION_ANALYSIS("postdomtree", PostDominatorTreeAnalysis()) FUNCTION_ANALYSIS("demanded-bits", DemandedBitsAnalysis()) FUNCTION_ANALYSIS("domfrontier", DominanceFrontierAnalysis()) +FUNCTION_ANALYSIS("func-properties", FunctionPropertiesAnalysis()) FUNCTION_ANALYSIS("loops", LoopAnalysis()) FUNCTION_ANALYSIS("lazy-value-info", LazyValueAnalysis()) FUNCTION_ANALYSIS("da", DependenceAnalysis()) -FUNCTION_ANALYSIS("func-properties", FunctionPropertiesAnalysis()) FUNCTION_ANALYSIS("inliner-size-estimator", InlineSizeEstimatorAnalysis()) FUNCTION_ANALYSIS("memdep", MemoryDependenceAnalysis()) FUNCTION_ANALYSIS("memoryssa", MemorySSAAnalysis()) @@ -234,6 +234,7 @@ FUNCTION_PASS("print", DominatorTreePrinterPass(dbgs())) FUNCTION_PASS("print", PostDominatorTreePrinterPass(dbgs())) FUNCTION_PASS("print", DemandedBitsPrinterPass(dbgs())) FUNCTION_PASS("print", DominanceFrontierPrinterPass(dbgs())) +FUNCTION_PASS("print", FunctionPropertiesPrinterPass(dbgs())) FUNCTION_PASS("print", InlineCostAnnotationPrinterPass(dbgs())) FUNCTION_PASS("print", InlineSizeEstimatorAnalysisPrinterPass(dbgs())) diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 1e39db5a984a6f..b8da73de0f8c81 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -79,6 +79,6 @@ def shuffle_vector_pseudos : GICombineGroup<[dup, rev, ext, zip, uzp, trn]>; def AArch64PostLegalizerCombinerHelper : GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper", [erase_undef_store, combines_for_extload, - shuffle_vector_pseudos]> { + sext_trunc_sextload, shuffle_vector_pseudos]> { let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule"; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 791b10f4917177..a72d100b130754 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -1155,12 +1155,6 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, }); } - SmallVector InArgs; - if (!Info.OrigRet.Ty->isVoidTy()) { - LLVM_DEBUG(dbgs() << "Call return values not yet handled\n"); - return false; - } - // If we can lower as a tail call, do that instead. bool CanTailCallOpt = false; @@ -1232,9 +1226,6 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); - // Now we can add the actual call instruction to the correct position. 
- MIRBuilder.insertInstr(MIB); - // If Callee is a reg, since it is used by a target specific // instruction, it must have a register class matching the // constraint of that instruction. @@ -1248,6 +1239,31 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, 1)); } + auto OrigInsertPt = MIRBuilder.getInsertPt(); + + // Now we can add the actual call instruction to the correct position. + MIRBuilder.insertInstr(MIB); + + // Insert this now to give us an anchor point for managing the insert point. + MachineInstrBuilder CallSeqEnd = + MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN); + + SmallVector InArgs; + if (!Info.OrigRet.Ty->isVoidTy()) { + splitToValueTypes( + MIRBuilder, Info.OrigRet, InArgs, DL, Info.CallConv, false, + [&](ArrayRef Regs, Register DstReg, + LLT LLTy, LLT PartLLT, int VTSplitIdx) { + assert(DstReg == Info.OrigRet.Regs[VTSplitIdx]); + packSplitRegsToOrigType(MIRBuilder, Info.OrigRet.Regs[VTSplitIdx], + Regs, LLTy, PartLLT); + }); + } + + // Make sure the raw argument copies are inserted before the marshalling to + // the original types. + MIRBuilder.setInsertPt(MIRBuilder.getMBB(), CallSeqEnd); + // Finally we can copy the returned value back into its virtual-register. In // symmetry with the arguments, the physical register must be an // implicit-define of the call instruction. @@ -1260,9 +1276,10 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, } uint64_t CalleePopBytes = NumBytes; - MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKDOWN) - .addImm(0) - .addImm(CalleePopBytes); + CallSeqEnd.addImm(0) + .addImm(CalleePopBytes); + // Restore the insert point to after the call sequence. + MIRBuilder.setInsertPt(MIRBuilder.getMBB(), OrigInsertPt); return true; } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 0986e1efb98406..ea1d20f7387dfc 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -35,7 +35,7 @@ struct FoldCandidate { int FrameIndexToFold; }; int ShrinkOpcode; - unsigned char UseOpNo; + unsigned UseOpNo; MachineOperand::MachineOperandType Kind; bool Commuted; @@ -662,6 +662,11 @@ void SIFoldOperands::foldOperand( Use = MRI->use_begin(DestReg), E = MRI->use_end(); Use != E; Use = NextUse) { NextUse = std::next(Use); + + // There's no point trying to fold into an implicit operand. + if (Use->isImplicit()) + continue; + FoldCandidate FC = FoldCandidate(Use->getParent(), Use.getOperandNo(), &UseMI->getOperand(1)); CopyUses.push_back(FC); diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 32dc14e5ec1952..985e88dfa017f1 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -276,7 +276,7 @@ class SCCPSolver : public InstVisitor { // isEdgeFeasible - Return true if the control flow edge from the 'From' basic // block to the 'To' basic block is currently feasible. - bool isEdgeFeasible(BasicBlock *From, BasicBlock *To); + bool isEdgeFeasible(BasicBlock *From, BasicBlock *To) const; std::vector getStructLatticeValueFor(Value *V) const { std::vector StructValues; @@ -705,7 +705,7 @@ void SCCPSolver::getFeasibleSuccessors(Instruction &TI, // isEdgeFeasible - Return true if the control flow edge from the 'From' basic // block to the 'To' basic block is currently feasible. 
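isEdgeFeasible gains a const qualifier here because the new removeNonFeasibleEdges helper below receives the solver by const reference and only queries it. A minimal sketch of that constraint, with a hypothetical Solver type standing in for SCCPSolver:

// Hypothetical Solver standing in for SCCPSolver.
struct Solver {
  // The trailing const is what lets a caller holding `const Solver &`
  // use this query, mirroring SCCPSolver::isEdgeFeasible after this change.
  bool isEdgeFeasible(int From, int To) const { return From != To; }
};

// Mirrors the shape of the new helper: the solver is only consulted,
// never mutated, so it is passed by const reference.
static bool removeNonFeasibleEdges(const Solver &S, int BB) {
  return S.isEdgeFeasible(BB, BB + 1);
}

int main() {
  Solver S;
  return removeNonFeasibleEdges(S, 0) ? 0 : 1;
}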
-bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { +bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) const { // Check if we've called markEdgeExecutable on the edge yet. (We could // be more aggressive and try to consider edges which haven't been marked // yet, but there isn't any need.) @@ -1807,39 +1807,51 @@ static void findReturnsToZap(Function &F, } } -// Update the condition for terminators that are branching on indeterminate -// values, forcing them to use a specific edge. -static void forceIndeterminateEdge(Instruction* I, SCCPSolver &Solver) { - BasicBlock *Dest = nullptr; - Constant *C = nullptr; - if (SwitchInst *SI = dyn_cast(I)) { - if (!isa(SI->getCondition())) { - // Indeterminate switch; use first case value. - Dest = SI->case_begin()->getCaseSuccessor(); - C = SI->case_begin()->getCaseValue(); - } - } else if (BranchInst *BI = dyn_cast(I)) { - if (!isa(BI->getCondition())) { - // Indeterminate branch; use false. - Dest = BI->getSuccessor(1); - C = ConstantInt::getFalse(BI->getContext()); - } - } else if (IndirectBrInst *IBR = dyn_cast(I)) { - if (!isa(IBR->getAddress()->stripPointerCasts())) { - // Indeterminate indirectbr; use successor 0. - Dest = IBR->getSuccessor(0); - C = BlockAddress::get(IBR->getSuccessor(0)); - } - } else { - llvm_unreachable("Unexpected terminator instruction"); +static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB, + DomTreeUpdater &DTU) { + SmallPtrSet FeasibleSuccessors; + bool HasNonFeasibleEdges = false; + for (BasicBlock *Succ : successors(BB)) { + if (Solver.isEdgeFeasible(BB, Succ)) + FeasibleSuccessors.insert(Succ); + else + HasNonFeasibleEdges = true; } - if (C) { - assert(Solver.isEdgeFeasible(I->getParent(), Dest) && - "Didn't find feasible edge?"); - (void)Dest; - I->setOperand(0, C); + // All edges feasible, nothing to do. + if (!HasNonFeasibleEdges) + return false; + + // SCCP can only determine non-feasible edges for br, switch and indirectbr. + Instruction *TI = BB->getTerminator(); + assert((isa(TI) || isa(TI) || + isa(TI)) && + "Terminator must be a br, switch or indirectbr"); + + if (FeasibleSuccessors.size() == 1) { + // Replace with an unconditional branch to the only feasible successor. + BasicBlock *OnlyFeasibleSuccessor = *FeasibleSuccessors.begin(); + SmallVector Updates; + bool HaveSeenOnlyFeasibleSuccessor = false; + for (BasicBlock *Succ : successors(BB)) { + if (Succ == OnlyFeasibleSuccessor && !HaveSeenOnlyFeasibleSuccessor) { + // Don't remove the edge to the only feasible successor the first time + // we see it. We still do need to remove any multi-edges to it though. + HaveSeenOnlyFeasibleSuccessor = true; + continue; + } + + Succ->removePredecessor(BB); + Updates.push_back({DominatorTree::Delete, BB, Succ}); + } + + DTU.applyUpdatesPermissive(Updates); + BranchInst::Create(OnlyFeasibleSuccessor, BB); + TI->eraseFromParent(); + } else { + llvm_unreachable("Either all successors are feasible, or exactly one is"); } + return true; } bool llvm::runIPSCCP( @@ -1952,45 +1964,11 @@ bool llvm::runIPSCCP( /*UseLLVMTrap=*/false, /*PreserveLCSSA=*/false, &DTU); - // Now that all instructions in the function are constant folded, - // use ConstantFoldTerminator to get rid of in-edges, record DT updates and - // delete dead BBs. - for (BasicBlock *DeadBB : BlocksToErase) { - // If there are any PHI nodes in this successor, drop entries for BB now. 
- for (Value::user_iterator UI = DeadBB->user_begin(), - UE = DeadBB->user_end(); - UI != UE;) { - // Grab the user and then increment the iterator early, as the user - // will be deleted. Step past all adjacent uses from the same user. - auto *I = dyn_cast(*UI); - do { ++UI; } while (UI != UE && *UI == I); - - // Ignore blockaddress users; BasicBlock's dtor will handle them. - if (!I) continue; - - // If we have forced an edge for an indeterminate value, then force the - // terminator to fold to that edge. - forceIndeterminateEdge(I, Solver); - BasicBlock *InstBB = I->getParent(); - bool Folded = ConstantFoldTerminator(InstBB, - /*DeleteDeadConditions=*/false, - /*TLI=*/nullptr, &DTU); - assert(Folded && - "Expect TermInst on constantint or blockaddress to be folded"); - (void) Folded; - // If we folded the terminator to an unconditional branch to another - // dead block, replace it with Unreachable, to avoid trying to fold that - // branch again. - BranchInst *BI = cast(InstBB->getTerminator()); - if (BI && BI->isUnconditional() && - !Solver.isBlockExecutable(BI->getSuccessor(0))) { - InstBB->getTerminator()->eraseFromParent(); - new UnreachableInst(InstBB->getContext(), InstBB); - } - } - // Mark dead BB for deletion. + for (BasicBlock &BB : F) + MadeChanges |= removeNonFeasibleEdges(Solver, &BB, DTU); + + for (BasicBlock *DeadBB : BlocksToErase) DTU.deleteBB(DeadBB); - } for (BasicBlock &BB : F) { for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) { diff --git a/llvm/test/Analysis/FunctionPropertiesAnalysis/matmul.ll b/llvm/test/Analysis/FunctionPropertiesAnalysis/matmul.ll new file mode 100644 index 00000000000000..506914972a1042 --- /dev/null +++ b/llvm/test/Analysis/FunctionPropertiesAnalysis/matmul.ll @@ -0,0 +1,140 @@ +; RUN: opt < %s -passes='print' -disable-output 2>&1 | FileCheck %s + +define i32 @main() { +; CHECK-DAG: Printing analysis results of CFA for function 'main': + +entry: + %retval = alloca i32, align 4 + %mat1 = alloca [2 x [2 x i32]], align 16 + %mat2 = alloca [2 x [2 x i32]], align 16 + %res = alloca [2 x [2 x i32]], align 16 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + %arraydecay = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %mat1, i64 0, i64 0 + %arraydecay1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %mat2, i64 0, i64 0 + %arraydecay2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %res, i64 0, i64 0 + call void @multiply([2 x i32]* %arraydecay, [2 x i32]* %arraydecay1, [2 x i32]* %arraydecay2) + ret i32 0 +} +; CHECK-DAG: BasicBlockCount: 1 +; CHECK-DAG: BlocksReachedFromConditionalInstruction: 0 +; CHECK-DAG: Uses: 1 +; CHECK-DAG: DirectCallsToDefinedFunctions: 1 +; CHECK-DAG: LoadInstCount: 0 +; CHECK-DAG: StoreInstCount: 1 +; CHECK-DAG: MaxLoopDepth: 0 +; CHECK-DAG: TopLevelLoopCount: 0 + +define void @multiply([2 x i32]* %mat1, [2 x i32]* %mat2, [2 x i32]* %res) { +; CHECK-DAG: Printing analysis results of CFA for function 'multiply': +entry: + %mat1.addr = alloca [2 x i32]*, align 8 + %mat2.addr = alloca [2 x i32]*, align 8 + %res.addr = alloca [2 x i32]*, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + %k = alloca i32, align 4 + store [2 x i32]* %mat1, [2 x i32]** %mat1.addr, align 8 + store [2 x i32]* %mat2, [2 x i32]** %mat2.addr, align 8 + store [2 x i32]* %res, [2 x i32]** %res.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc24, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp 
slt i32 %0, 2 + br i1 %cmp, label %for.body, label %for.end26 + +for.body: ; preds = %for.cond + store i32 0, i32* %j, align 4 + br label %for.cond1 + +for.cond1: ; preds = %for.inc21, %for.body + %1 = load i32, i32* %j, align 4 + %cmp2 = icmp slt i32 %1, 2 + br i1 %cmp2, label %for.body3, label %for.end23 + +for.body3: ; preds = %for.cond1 + %2 = load [2 x i32]*, [2 x i32]** %res.addr, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds [2 x i32], [2 x i32]* %2, i64 %idxprom + %4 = load i32, i32* %j, align 4 + %idxprom4 = sext i32 %4 to i64 + %arrayidx5 = getelementptr inbounds [2 x i32], [2 x i32]* %arrayidx, i64 0, i64 %idxprom4 + store i32 0, i32* %arrayidx5, align 4 + store i32 0, i32* %k, align 4 + br label %for.cond6 + +for.cond6: ; preds = %for.inc, %for.body3 + %5 = load i32, i32* %k, align 4 + %cmp7 = icmp slt i32 %5, 2 + br i1 %cmp7, label %for.body8, label %for.end + +for.body8: ; preds = %for.cond6 + %6 = load [2 x i32]*, [2 x i32]** %mat1.addr, align 8 + %7 = load i32, i32* %i, align 4 + %idxprom9 = sext i32 %7 to i64 + %arrayidx10 = getelementptr inbounds [2 x i32], [2 x i32]* %6, i64 %idxprom9 + %8 = load i32, i32* %k, align 4 + %idxprom11 = sext i32 %8 to i64 + %arrayidx12 = getelementptr inbounds [2 x i32], [2 x i32]* %arrayidx10, i64 0, i64 %idxprom11 + %9 = load i32, i32* %arrayidx12, align 4 + %10 = load [2 x i32]*, [2 x i32]** %mat2.addr, align 8 + %11 = load i32, i32* %k, align 4 + %idxprom13 = sext i32 %11 to i64 + %arrayidx14 = getelementptr inbounds [2 x i32], [2 x i32]* %10, i64 %idxprom13 + %12 = load i32, i32* %j, align 4 + %idxprom15 = sext i32 %12 to i64 + %arrayidx16 = getelementptr inbounds [2 x i32], [2 x i32]* %arrayidx14, i64 0, i64 %idxprom15 + %13 = load i32, i32* %arrayidx16, align 4 + %mul = mul nsw i32 %9, %13 + %14 = load [2 x i32]*, [2 x i32]** %res.addr, align 8 + %15 = load i32, i32* %i, align 4 + %idxprom17 = sext i32 %15 to i64 + %arrayidx18 = getelementptr inbounds [2 x i32], [2 x i32]* %14, i64 %idxprom17 + %16 = load i32, i32* %j, align 4 + %idxprom19 = sext i32 %16 to i64 + %arrayidx20 = getelementptr inbounds [2 x i32], [2 x i32]* %arrayidx18, i64 0, i64 %idxprom19 + %17 = load i32, i32* %arrayidx20, align 4 + %add = add nsw i32 %17, %mul + store i32 %add, i32* %arrayidx20, align 4 + br label %for.inc + +for.inc: ; preds = %for.body8 + %18 = load i32, i32* %k, align 4 + %inc = add nsw i32 %18, 1 + store i32 %inc, i32* %k, align 4 + br label %for.cond6 + +for.end: ; preds = %for.cond6 + br label %for.inc21 + +for.inc21: ; preds = %for.end + %19 = load i32, i32* %j, align 4 + %inc22 = add nsw i32 %19, 1 + store i32 %inc22, i32* %j, align 4 + br label %for.cond1 + +for.end23: ; preds = %for.cond1 + br label %for.inc24 + +for.inc24: ; preds = %for.end23 + %20 = load i32, i32* %i, align 4 + %inc25 = add nsw i32 %20, 1 + store i32 %inc25, i32* %i, align 4 + br label %for.cond + +for.end26: ; preds = %for.cond + ret void +} + +; CHECK-DAG: BasicBlockCount: 13 +; CHECK-DAG: BlocksReachedFromConditionalInstruction: 6 +; CHECK-DAG: Uses: 2 +; CHECK-DAG: DirectCallsToDefinedFunctions: 0 +; CHECK-DAG: LoadInstCount: 21 +; CHECK-DAG: StoreInstCount: 11 +; CHECK-DAG: MaxLoopDepth: 3 +; CHECK-DAG: TopLevelLoopCount: 1 \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-sext-trunc-sextload.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sext-trunc-sextload.mir new file mode 100644 index 00000000000000..616973c04ac4ad --- /dev/null +++ 
b/llvm/test/CodeGen/AArch64/GlobalISel/combine-sext-trunc-sextload.mir @@ -0,0 +1,63 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64 -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s +--- +name: test_combine_sext_trunc_of_sextload +legalized: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x0 + ; CHECK-LABEL: name: test_combine_sext_trunc_of_sextload + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[SEXTLOAD:%[0-9]+]]:_(s64) = G_SEXTLOAD [[COPY]](p0) :: (load 2) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[SEXTLOAD]](s64) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32) + ; CHECK: $w0 = COPY [[COPY1]](s32) + %0:_(p0) = COPY $x0 + %1:_(s64) = G_SEXTLOAD %0:_(p0) :: (load 2) + %2:_(s32) = G_TRUNC %1:_(s64) + %3:_(s32) = G_SEXT_INREG %2:_(s32), 16 + $w0 = COPY %3(s32) +... +--- +name: test_combine_sext_of_sextload +legalized: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x0 + ; CHECK-LABEL: name: test_combine_sext_of_sextload + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p0) :: (load 2) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SEXTLOAD]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: $w0 = COPY [[COPY2]](s32) + %0:_(p0) = COPY $x0 + %1:_(s32) = G_SEXTLOAD %0:_(p0) :: (load 2) + %2:_(s32) = COPY %1:_(s32) + %3:_(s32) = G_SEXT_INREG %2:_(s32), 16 + $w0 = COPY %3(s32) +... +--- +name: test_combine_sext_of_sextload_not_matching +legalized: true +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x0 + ; Here we're trying to extend from a smaller value than was extended in the load. + ; CHECK-LABEL: name: test_combine_sext_of_sextload_not_matching + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[SEXTLOAD:%[0-9]+]]:_(s32) = G_SEXTLOAD [[COPY]](p0) :: (load 2) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SEXTLOAD]](s32) + ; CHECK: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 24 + ; CHECK: $w0 = COPY [[SEXT_INREG]](s32) + %0:_(p0) = COPY $x0 + %1:_(s32) = G_SEXTLOAD %0:_(p0) :: (load 2) + %2:_(s32) = COPY %1:_(s32) + %3:_(s32) = G_SEXT_INREG %2:_(s32), 24 + $w0 = COPY %3(s32) +... 
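
The AArch64 MIR test above exercises the post-legalizer combine that removes a G_SEXT_INREG whose operand is (a copy or truncation of) a G_SEXTLOAD result. The arithmetic fact motivating the first two cases is that sign-extending a value from N bits is a no-op once that value has already been sign-extended from M <= N bits, which is exactly what the sextload guarantees. The standalone C++ sketch below only models that identity with plain integers; signExtendFrom is a made-up helper for illustration and is not the combiner's actual matching code, which, as the third (not_matching) case shows, can be stricter about which widths it accepts.

// Minimal model of the identity behind the sext(trunc(sextload)) combine.
// signExtendFrom(X, Bits) mirrors what G_SEXT_INREG X, Bits computes:
// sign-extend X from its low Bits bits. Hypothetical helper, not LLVM API;
// it assumes the usual two's-complement arithmetic right shift.
#include <cassert>
#include <cstdint>

static int32_t signExtendFrom(int32_t X, unsigned Bits) {
  assert(Bits >= 1 && Bits <= 32);
  const unsigned Shift = 32 - Bits;
  return static_cast<int32_t>(static_cast<uint32_t>(X) << Shift) >> Shift;
}

int main() {
  // A G_SEXTLOAD of 2 bytes yields a value already sign-extended from 16 bits.
  const int32_t Loaded = signExtendFrom(0xFEDC, 16); // raw 16-bit memory image

  // G_SEXT_INREG ..., 16 on that value changes nothing, which is why the
  // combine can drop it in the first two test cases.
  assert(signExtendFrom(Loaded, 16) == Loaded);

  // Re-extending from any width >= 16 is equally a no-op on such a value,
  // even though the not_matching case shows the combine itself does not
  // fire when the widths disagree.
  assert(signExtendFrom(Loaded, 24) == Loaded);
  return 0;
}

Building and running this sketch succeeds silently; it is only meant to make the redundancy the combine exploits concrete, not to reproduce the CombinerHelper matching logic.
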
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll new file mode 100644 index 00000000000000..1136afafbd4aaa --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-return-values.ll @@ -0,0 +1,2509 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=irtranslator -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +declare i1 @external_i1_func_void() #0 +declare zeroext i1 @external_i1_zeroext_func_void() #0 +declare signext i1 @external_i1_signext_func_void() #0 + +declare i8 @external_i8_func_void() #0 +declare zeroext i8 @external_i8_zeroext_func_void() #0 +declare signext i8 @external_i8_signext_func_void() #0 + +declare i16 @external_i16_func_void() #0 +declare <2 x i16> @external_v2i16_func_void() #0 +declare <3 x i16> @external_v3i16_func_void() #0 +declare <4 x i16> @external_v4i16_func_void() #0 +declare zeroext i16 @external_i16_zeroext_func_void() #0 +declare signext i16 @external_i16_signext_func_void() #0 + +declare i48 @external_i48_func_void() #0 +declare zeroext i48 @external_i48_zeroext_func_void() #0 +declare signext i48 @external_i48_signext_func_void() #0 + +declare i32 @external_i32_func_void() #0 +declare i64 @external_i64_func_void() #0 +declare half @external_f16_func_void() #0 +declare float @external_f32_func_void() #0 +declare double @external_f64_func_void() #0 + +declare i8 addrspace(1)* @external_p1_func_void() #0 +declare <2 x i8 addrspace(1)*> @external_v2p1_func_void() #0 + +declare i8 addrspace(3)* @external_p3_func_void() #0 +declare <2 x i8 addrspace(3)*> @external_v2p3_func_void() #0 + +declare <2 x half> @external_v2f16_func_void() #0 +declare <3 x half> @external_v3f16_func_void() #0 +declare <4 x half> @external_v4f16_func_void() #0 +declare <3 x float> @external_v3f32_func_void() #0 +declare <5 x float> @external_v5f32_func_void() #0 +declare <2 x double> @external_v2f64_func_void() #0 + +declare <2 x i32> @external_v2i32_func_void() #0 +declare <3 x i32> @external_v3i32_func_void() #0 +declare <4 x i32> @external_v4i32_func_void() #0 +declare <5 x i32> @external_v5i32_func_void() #0 +declare <8 x i32> @external_v8i32_func_void() #0 +declare <16 x i32> @external_v16i32_func_void() #0 +declare <32 x i32> @external_v32i32_func_void() #0 +declare { <32 x i32>, i32 } @external_v32i32_i32_func_void() #0 + +declare { i32, i64 } @external_i32_i64_func_void() #0 + +declare [2 x i32] @external_a2i32_func_void() #0 +declare [5 x i8] @external_a5i8_func_void() #0 + +; return value and argument +declare hidden i32 @external_i32_func_i32(i32) #0 + + +define amdgpu_kernel void @test_call_external_i32_func_i32_imm(i32 addrspace(1)* %out) #0 { + ; GCN-LABEL: name: test_call_external_i32_func_i32_imm + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = 
COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 42 + ; GCN: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) + ; GCN: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[INT]](p4) :: (dereferenceable invariant load 8 from %ir.out.kernarg.offset.cast, align 16, addrspace 4) + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i32_func_i32 + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C1]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C2]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C3]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: $vgpr0 = COPY [[C]](s32) + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i32_func_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[COPY21]](s32), [[LOAD]](p1) :: (volatile store 4 into %ir.out.load, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i32 @external_i32_func_i32(i32 42) + store volatile i32 %val, i32 addrspace(1)* %out + ret void +} + +define amdgpu_kernel void @test_call_external_i1_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i1_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; 
GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i1_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i1_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[TRUNC]](s1), [[DEF]](p1) :: (volatile store 1 into `i1 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i1 @external_i1_func_void() + store volatile i1 %val, i1 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i1_zeroext_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i1_zeroext_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i1_zeroext_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: 
[[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i1_zeroext_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s1) + ; GCN: G_STORE [[ZEXT]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i1 @external_i1_zeroext_func_void() + %val.ext = zext i1 %val to i32 + store volatile i32 %val.ext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i1_signext_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i1_signext_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i1_signext_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: 
[[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i1_signext_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s1) = G_TRUNC [[COPY21]](s32) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s1) + ; GCN: G_STORE [[SEXT]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i1 @external_i1_signext_func_void() + %val.ext = sext i1 %val to i32 + store volatile i32 %val.ext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i8_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i8_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i8_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: 
[[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i8_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i8 @external_i8_func_void() + store volatile i8 %val, i8 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i8_zeroext_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i8_zeroext_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i8_zeroext_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY 
[[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i8_zeroext_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC1]](s8) + ; GCN: G_STORE [[ZEXT]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i8 @external_i8_zeroext_func_void() + %val.ext = zext i8 %val to i32 + store volatile i32 %val.ext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i8_signext_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i8_signext_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i8_signext_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 
= COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i8_signext_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC1]](s8) + ; GCN: G_STORE [[SEXT]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i8 @external_i8_signext_func_void() + %val.ext = sext i8 %val to i32 + store volatile i32 %val.ext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i16_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i16_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i16_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, 
implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[TRUNC]](s16), [[DEF]](p1) :: (volatile store 2 into `i16 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i16 @external_i16_func_void() + store volatile i16 %val, i16 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i16_zeroext_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i16_zeroext_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i16_zeroext_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i16_zeroext_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: 
[[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[TRUNC]](s16) + ; GCN: G_STORE [[ZEXT]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i16 @external_i16_zeroext_func_void() + %val.ext = zext i16 %val to i32 + store volatile i32 %val.ext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i16_signext_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i16_signext_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i16_signext_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i16_signext_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[TRUNC]](s16) + ; GCN: G_STORE [[SEXT]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i16 @external_i16_signext_func_void() + %val.ext = sext i16 %val to 
i32 + store volatile i32 %val.ext, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[COPY21]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i32 @external_i32_func_void() + store volatile i32 %val, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i48_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i48_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY 
$vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i48_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i48_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN: [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64) + ; GCN: G_STORE [[TRUNC]](s48), [[DEF]](p1) :: (volatile store 6 into `i48 addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i48 @external_i48_func_void() + store volatile i48 %val, i48 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i48_zeroext_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i48_zeroext_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: 
[[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i48_zeroext_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i48_zeroext_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN: [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64) + ; GCN: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s48) + ; GCN: G_STORE [[ZEXT]](s64), [[DEF]](p1) :: (volatile store 8 into `i64 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i48 @external_i48_zeroext_func_void() + %ext = zext i48 %val to i64 + store volatile i64 %ext, i64 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i48_signext_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i48_signext_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: 
[[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i48_signext_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i48_signext_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN: [[TRUNC:%[0-9]+]]:_(s48) = G_TRUNC [[MV]](s64) + ; GCN: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[TRUNC]](s48) + ; GCN: G_STORE [[SEXT]](s64), [[DEF]](p1) :: (volatile store 8 into `i64 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i48 @external_i48_signext_func_void() + %ext = sext i48 %val to i64 + store volatile i64 %ext, i64 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_i64_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i64_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = 
G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i64_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i64_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN: G_STORE [[MV]](s64), [[DEF]](p1) :: (volatile store 8 into `i64 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i64 @external_i64_func_void() + store volatile i64 %val, i64 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_p1_func_void() #0 { + ; GCN-LABEL: name: test_call_external_p1_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_p1_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: 
[[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_p1_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN: G_STORE [[MV]](p1), [[DEF]](p1) :: (volatile store 8 into `i8 addrspace(1)* addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call i8 addrspace(1)* @external_p1_func_void() + store volatile i8 addrspace(1)* %val, i8 addrspace(1)* addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v2p1_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v2p1_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v2p1_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY 
[[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v2p1_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY23]](s32), [[COPY24]](s32) + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p1>) = G_BUILD_VECTOR [[MV]](p1), [[MV1]](p1) + ; GCN: G_STORE [[BUILD_VECTOR]](<2 x p1>), [[DEF]](p1) :: (volatile store 16 into `<2 x i8 addrspace(1)*> addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <2 x i8 addrspace(1)*> @external_v2p1_func_void() + store volatile <2 x i8 addrspace(1)*> %val, <2 x i8 addrspace(1)*> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_p3_func_void() #0 { + ; GCN-LABEL: name: test_call_external_p3_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p3) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_p3_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + 
; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_p3_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[COPY21]](p3), [[DEF]](p3) :: (volatile store 4 into `i8 addrspace(3)* addrspace(3)* undef`, addrspace 3) + ; GCN: S_ENDPGM 0 + %val = call i8 addrspace(3)* @external_p3_func_void() + store volatile i8 addrspace(3)* %val, i8 addrspace(3)* addrspace(3)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v2p3_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v2p3_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p3) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v2p3_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: 
[[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v2p3_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(p3) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[COPY21]](p3), [[COPY22]](p3) + ; GCN: G_STORE [[BUILD_VECTOR]](<2 x p3>), [[DEF]](p3) :: (volatile store 8 into `<2 x i8 addrspace(3)*> addrspace(3)* undef`, addrspace 3) + ; GCN: S_ENDPGM 0 + %val = call <2 x i8 addrspace(3)*> @external_v2p3_func_void() + store volatile <2 x i8 addrspace(3)*> %val, <2 x i8 addrspace(3)*> addrspace(3)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_f16_func_void() #0 { + ; GCN-LABEL: name: test_call_external_f16_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_f16_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL 
[[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_f16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[TRUNC]](s16), [[DEF]](p1) :: (volatile store 2 into `half addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call half @external_f16_func_void() + store volatile half %val, half addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_f32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_f32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_f32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + 
; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_f32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[COPY21]](s32), [[DEF]](p1) :: (volatile store 4 into `float addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call float @external_f32_func_void() + store volatile float %val, float addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_f64_func_void() #0 { + ; GCN-LABEL: name: test_call_external_f64_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_f64_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_f64_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit 
$vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN: G_STORE [[MV]](s64), [[DEF]](p1) :: (volatile store 8 into `double addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call double @external_f64_func_void() + store volatile double %val, double addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v2f64_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v2f64_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v2f64_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v2f64_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY 
$vgpr3 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY21]](s32), [[COPY22]](s32) + ; GCN: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY23]](s32), [[COPY24]](s32) + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) + ; GCN: G_STORE [[BUILD_VECTOR]](<2 x s64>), [[DEF]](p1) :: (volatile store 16 into `<2 x double> addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <2 x double> @external_v2f64_func_void() + store volatile <2 x double> %val, <2 x double> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v2i32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v2i32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v2i32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v2i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32) + ; GCN: G_STORE [[BUILD_VECTOR]](<2 x s32>), [[DEF]](p1) :: (volatile store 8 into `<2 x i32> addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <2 x i32> @external_v2i32_func_void() + store volatile <2 x i32> %val, <2 x i32> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v3i32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v3i32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v3i32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v3i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32) + ; GCN: G_STORE [[BUILD_VECTOR]](<3 x s32>), [[DEF]](p1) :: 
(volatile store 12 into `<3 x i32> addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <3 x i32> @external_v3i32_func_void() + store volatile <3 x i32> %val, <3 x i32> addrspace(1)* undef, align 8 + ret void +} + +define amdgpu_kernel void @test_call_external_v4i32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v4i32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v4i32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v4i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32) + ; GCN: G_STORE [[BUILD_VECTOR]](<4 x s32>), [[DEF]](p1) :: (volatile store 16 into `<4 x i32> 
addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <4 x i32> @external_v4i32_func_void() + store volatile <4 x i32> %val, <4 x i32> addrspace(1)* undef, align 8 + ret void +} + +define amdgpu_kernel void @test_call_external_v5i32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v5i32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v5i32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v5i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32) + ; GCN: G_STORE [[BUILD_VECTOR]](<5 x s32>), 
[[DEF]](p1) :: (volatile store 20 into `<5 x i32> addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <5 x i32> @external_v5i32_func_void() + store volatile <5 x i32> %val, <5 x i32> addrspace(1)* undef, align 8 + ret void +} + +define amdgpu_kernel void @test_call_external_v8i32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v8i32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v8i32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v8i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GCN: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GCN: [[COPY28:%[0-9]+]]:_(s32) = COPY 
$vgpr7 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32) + ; GCN: G_STORE [[BUILD_VECTOR]](<8 x s32>), [[DEF]](p1) :: (volatile store 32 into `<8 x i32> addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <8 x i32> @external_v8i32_func_void() + store volatile <8 x i32> %val, <8 x i32> addrspace(1)* undef, align 8 + ret void +} + +define amdgpu_kernel void @test_call_external_v16i32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v16i32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v16i32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v16i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit-def $vgpr8, implicit-def $vgpr9, implicit-def $vgpr10, implicit-def 
$vgpr11, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr15 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GCN: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GCN: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GCN: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GCN: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GCN: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GCN: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GCN: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GCN: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GCN: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GCN: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32), [[COPY32]](s32), [[COPY33]](s32), [[COPY34]](s32), [[COPY35]](s32), [[COPY36]](s32) + ; GCN: G_STORE [[BUILD_VECTOR]](<16 x s32>), [[DEF]](p1) :: (volatile store 64 into `<16 x i32> addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <16 x i32> @external_v16i32_func_void() + store volatile <16 x i32> %val, <16 x i32> addrspace(1)* undef, align 8 + ret void +} + +define amdgpu_kernel void @test_call_external_v32i32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v32i32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v32i32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR 
[[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v32i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4, implicit-def $vgpr5, implicit-def $vgpr6, implicit-def $vgpr7, implicit-def $vgpr8, implicit-def $vgpr9, implicit-def $vgpr10, implicit-def $vgpr11, implicit-def $vgpr12, implicit-def $vgpr13, implicit-def $vgpr14, implicit-def $vgpr15, implicit-def $vgpr16, implicit-def $vgpr17, implicit-def $vgpr18, implicit-def $vgpr19, implicit-def $vgpr20, implicit-def $vgpr21, implicit-def $vgpr22, implicit-def $vgpr23, implicit-def $vgpr24, implicit-def $vgpr25, implicit-def $vgpr26, implicit-def $vgpr27, implicit-def $vgpr28, implicit-def $vgpr29, implicit-def $vgpr30, implicit-def $vgpr31 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; GCN: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; GCN: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; GCN: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; GCN: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; GCN: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; GCN: [[COPY32:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; GCN: [[COPY33:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; GCN: [[COPY34:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; GCN: [[COPY35:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; GCN: [[COPY36:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; GCN: [[COPY37:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; GCN: [[COPY38:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; GCN: [[COPY39:%[0-9]+]]:_(s32) = COPY $vgpr18 + ; GCN: [[COPY40:%[0-9]+]]:_(s32) = COPY $vgpr19 + ; GCN: [[COPY41:%[0-9]+]]:_(s32) = COPY $vgpr20 + ; GCN: [[COPY42:%[0-9]+]]:_(s32) = COPY $vgpr21 + ; GCN: [[COPY43:%[0-9]+]]:_(s32) = COPY $vgpr22 + ; GCN: [[COPY44:%[0-9]+]]:_(s32) = COPY $vgpr23 + ; GCN: [[COPY45:%[0-9]+]]:_(s32) = COPY $vgpr24 + ; GCN: [[COPY46:%[0-9]+]]:_(s32) = COPY $vgpr25 + ; GCN: [[COPY47:%[0-9]+]]:_(s32) = COPY $vgpr26 + ; GCN: [[COPY48:%[0-9]+]]:_(s32) = COPY $vgpr27 + ; GCN: [[COPY49:%[0-9]+]]:_(s32) = COPY $vgpr28 + ; GCN: [[COPY50:%[0-9]+]]:_(s32) = COPY $vgpr29 + ; GCN: [[COPY51:%[0-9]+]]:_(s32) = COPY $vgpr30 + ; GCN: [[COPY52:%[0-9]+]]:_(s32) = COPY $vgpr31 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32), [[COPY32]](s32), [[COPY33]](s32), [[COPY34]](s32), [[COPY35]](s32), [[COPY36]](s32), [[COPY37]](s32), [[COPY38]](s32), [[COPY39]](s32), [[COPY40]](s32), [[COPY41]](s32), [[COPY42]](s32), 
[[COPY43]](s32), [[COPY44]](s32), [[COPY45]](s32), [[COPY46]](s32), [[COPY47]](s32), [[COPY48]](s32), [[COPY49]](s32), [[COPY50]](s32), [[COPY51]](s32), [[COPY52]](s32) + ; GCN: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store 128 into `<32 x i32> addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <32 x i32> @external_v32i32_func_void() + store volatile <32 x i32> %val, <32 x i32> addrspace(1)* undef, align 8 + ret void +} + +define amdgpu_kernel void @test_call_external_v2i16_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v2i16_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v2i16_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v2i16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[COPY21]](<2 x s16>), [[DEF]](p1) :: (volatile store 4 into `<2 x i16> addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <2 x i16> @external_v2i16_func_void() + store 
volatile <2 x i16> %val, <2 x i16> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v3i16_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v3i16_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v3i16_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v3i16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>), [[DEF1]](<2 x s16>) + ; GCN: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; GCN: G_STORE [[UV]](<3 x s16>), [[DEF]](p1) :: (volatile store 6 into `<3 x i16> addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <3 x i16> @external_v3i16_func_void() + store volatile <3 x i16> 
%val, <3 x i16> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v4i16_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v4i16_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v4i16_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v4i16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>) + ; GCN: G_STORE [[CONCAT_VECTORS]](<4 x s16>), [[DEF]](p1) :: (volatile store 8 into `<4 x i16> addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <4 x i16> @external_v4i16_func_void() + store volatile <4 x i16> %val, <4 x i16> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v2f16_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v2f16_func_void + ; GCN: bb.1 (%ir-block.0): + ; 
GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v2f16_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v2f16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0 + ; GCN: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[COPY21]](<2 x s16>), [[DEF]](p1) :: (volatile store 4 into `<2 x half> addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <2 x half> @external_v2f16_func_void() + store volatile <2 x half> %val, <2 x half> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v3f16_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v3f16_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: 
[[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v3f16_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v3f16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[DEF1:%[0-9]+]]:_(<2 x s16>) = G_IMPLICIT_DEF + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>), [[DEF1]](<2 x s16>) + ; GCN: [[UV:%[0-9]+]]:_(<3 x s16>), [[UV1:%[0-9]+]]:_(<3 x s16>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>) + ; GCN: G_STORE [[UV]](<3 x s16>), [[DEF]](p1) :: (volatile store 6 into `<3 x half> addrspace(1)* undef`, align 8, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <3 x half> @external_v3f16_func_void() + store volatile <3 x half> %val, <3 x half> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v4f16_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v4f16_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: 
[[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v4f16_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v4f16_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY21]](<2 x s16>), [[COPY22]](<2 x s16>) + ; GCN: G_STORE [[CONCAT_VECTORS]](<4 x s16>), [[DEF]](p1) :: (volatile store 8 into `<4 x half> addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <4 x half> @external_v4f16_func_void() + store volatile <4 x half> %val, <4 x half> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v3f32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v3f32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY 
$sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v3f32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v3f32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32) + ; GCN: G_STORE [[BUILD_VECTOR]](<3 x s32>), [[DEF]](p1) :: (volatile store 12 into `<3 x float> addrspace(1)* undef`, align 16, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <3 x float> @external_v3f32_func_void() + store volatile <3 x float> %val, <3 x float> addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_v5f32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_v5f32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: 
ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_v5f32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_v5f32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32) + ; GCN: G_STORE [[BUILD_VECTOR]](<5 x s32>), [[DEF]](p1) :: (volatile store 20 into `<5 x float> addrspace(1)* undef`, align 32, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call <5 x float> @external_v5f32_func_void() + store volatile <5 x float> %val, <5 x float> addrspace(1)* undef + ret void +} + + +define amdgpu_kernel void @test_call_external_i32_i64_func_void() #0 { + ; GCN-LABEL: name: test_call_external_i32_i64_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: 
[[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: [[COPY10:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_i32_i64_func_void + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY13]], [[C]](s64) + ; GCN: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY18]], [[SHL]] + ; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY20]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY11]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY12]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY14]](s64) + ; GCN: $sgpr12 = COPY [[COPY15]](s32) + ; GCN: $sgpr13 = COPY [[COPY16]](s32) + ; GCN: $sgpr14 = COPY [[COPY17]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_i32_i64_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY23]](s32), [[COPY24]](s32) + ; GCN: G_STORE [[COPY22]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: G_STORE [[MV]](s64), [[COPY10]](p1) :: (volatile store 8 into `i64 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call { i32, i64 } @external_i32_i64_func_void() + %val.0 = extractvalue { i32, i64 } %val, 0 + %val.1 = extractvalue { i32, i64 } %val, 1 + store volatile i32 %val.0, i32 addrspace(1)* undef + store volatile i64 %val.1, i64 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_a2i32_func_void() #0 { + ; GCN-LABEL: name: test_call_external_a2i32_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: 
[[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_a2i32_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_a2i32_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[COPY21]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: G_STORE [[COPY22]](s32), [[DEF]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call [2 x i32] @external_a2i32_func_void() + %val.0 = extractvalue [2 x i32] %val, 0 + %val.1 = extractvalue [2 x i32] %val, 1 + store volatile i32 %val.0, i32 addrspace(1)* undef + store volatile i32 %val.1, i32 addrspace(1)* undef + ret void +} + +define amdgpu_kernel void @test_call_external_a5i8_func_void() #0 { + ; GCN-LABEL: name: test_call_external_a5i8_func_void + ; GCN: bb.1 (%ir-block.0): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: 
[[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_a5i8_func_void + ; GCN: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY12]], [[C]](s64) + ; GCN: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY18]], [[C1]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY17]], [[SHL]] + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C2]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; GCN: $sgpr12 = COPY [[COPY14]](s32) + ; GCN: $sgpr13 = COPY [[COPY15]](s32) + ; GCN: $sgpr14 = COPY [[COPY16]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_a5i8_func_void, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $vgpr2, implicit-def $vgpr3, implicit-def $vgpr4 + ; GCN: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; GCN: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY21]](s32) + ; GCN: [[TRUNC1:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC]](s16) + ; GCN: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; GCN: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[COPY22]](s32) + ; GCN: [[TRUNC3:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC2]](s16) + ; GCN: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; GCN: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[COPY23]](s32) + ; GCN: [[TRUNC5:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC4]](s16) + ; GCN: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; GCN: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[COPY24]](s32) + ; GCN: [[TRUNC7:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC6]](s16) + ; GCN: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; GCN: [[TRUNC8:%[0-9]+]]:_(s16) = G_TRUNC [[COPY25]](s32) + ; GCN: [[TRUNC9:%[0-9]+]]:_(s8) = G_TRUNC [[TRUNC8]](s16) + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: G_STORE [[TRUNC1]](s8), [[DEF]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1) + ; GCN: G_STORE [[TRUNC3]](s8), [[DEF]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1) + ; GCN: G_STORE [[TRUNC5]](s8), [[DEF]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1) + ; GCN: G_STORE [[TRUNC7]](s8), [[DEF]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1) + ; GCN: G_STORE [[TRUNC9]](s8), [[DEF]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %val = call [5 x i8] @external_a5i8_func_void() + %val.0 
= extractvalue [5 x i8] %val, 0 + %val.1 = extractvalue [5 x i8] %val, 1 + %val.2 = extractvalue [5 x i8] %val, 2 + %val.3 = extractvalue [5 x i8] %val, 3 + %val.4 = extractvalue [5 x i8] %val, 4 + store volatile i8 %val.0, i8 addrspace(1)* undef + store volatile i8 %val.1, i8 addrspace(1)* undef + store volatile i8 %val.2, i8 addrspace(1)* undef + store volatile i8 %val.3, i8 addrspace(1)* undef + store volatile i8 %val.4, i8 addrspace(1)* undef + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind noinline } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll new file mode 100644 index 00000000000000..d53cfe688f53cf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call-sret.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -global-isel -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -stop-after=irtranslator -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +declare hidden void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* sret, { i8, i32 } addrspace(5)* byval) #0 + +define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 { + ; GCN-LABEL: name: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 + ; GCN: bb.1 (%ir-block.1): + ; GCN: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; GCN: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; GCN: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; GCN: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; GCN: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; GCN: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; GCN: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; GCN: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; GCN: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; GCN: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 3 + ; GCN: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GCN: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; GCN: [[COPY10:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) + ; GCN: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.0.in.val + ; GCN: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %stack.1.out.val + ; GCN: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) + ; GCN: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; GCN: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX]], [[C2]](s32) + ; GCN: G_STORE [[C]](s8), [[FRAME_INDEX]](p5) :: (store 1 into %ir.in.gep01, addrspace 5) + ; GCN: G_STORE [[C1]](s32), [[PTR_ADD]](p5) :: (store 4 into %ir.in.gep1, addrspace 5) + ; GCN: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; GCN: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32 + ; GCN: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; GCN: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; GCN: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; GCN: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; GCN: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY13]], [[C3]](s64) + ; GCN: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; GCN: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; GCN: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; GCN: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; GCN: 
[[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; GCN: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; GCN: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; GCN: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C4]](s32) + ; GCN: [[OR:%[0-9]+]]:_(s32) = G_OR [[COPY18]], [[SHL]] + ; GCN: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; GCN: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; GCN: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY20]], [[C5]](s32) + ; GCN: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; GCN: $vgpr0 = COPY [[FRAME_INDEX1]](p5) + ; GCN: $vgpr1 = COPY [[FRAME_INDEX]](p5) + ; GCN: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>) + ; GCN: $sgpr4_sgpr5 = COPY [[COPY11]](p4) + ; GCN: $sgpr6_sgpr7 = COPY [[COPY12]](p4) + ; GCN: $sgpr8_sgpr9 = COPY [[PTR_ADD1]](p4) + ; GCN: $sgpr10_sgpr11 = COPY [[COPY14]](s64) + ; GCN: $sgpr12 = COPY [[COPY15]](s32) + ; GCN: $sgpr13 = COPY [[COPY16]](s32) + ; GCN: $sgpr14 = COPY [[COPY17]](s32) + ; GCN: $vgpr31 = COPY [[OR1]](s32) + ; GCN: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31 + ; GCN: ADJCALLSTACKDOWN 0, 0, implicit-def $scc + ; GCN: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[FRAME_INDEX1]], [[C2]](s32) + ; GCN: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[FRAME_INDEX1]](p5) :: (dereferenceable load 1 from %ir.out.gep02, addrspace 5) + ; GCN: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (dereferenceable load 4 from %ir.out.gep1, addrspace 5) + ; GCN: G_STORE [[LOAD]](s8), [[DEF]](p1) :: (volatile store 1 into `i8 addrspace(1)* undef`, addrspace 1) + ; GCN: G_STORE [[LOAD1]](s32), [[COPY10]](p1) :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + ; GCN: S_ENDPGM 0 + %in.val = alloca { i8, i32 }, align 4, addrspace(5) + %out.val = alloca { i8, i32 }, align 4, addrspace(5) + %in.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 0 + %in.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 1 + store i8 3, i8 addrspace(5)* %in.gep0 + store i32 8, i32 addrspace(5)* %in.gep1 + call void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* %out.val, { i8, i32 } addrspace(5)* %in.val) + %out.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 0 + %out.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 1 + %out.val0 = load i8, i8 addrspace(5)* %out.gep0 + %out.val1 = load i32, i32 addrspace(5)* %out.gep1 + store volatile i8 %out.val0, i8 addrspace(1)* undef + store volatile i32 %out.val1, i32 addrspace(1)* undef + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll index 353566a4d3dc79..6b29697ca086e7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll @@ -62,6 +62,7 @@ declare hidden void @external_void_func_v8i32(<8 x i32>) #0 declare hidden void @external_void_func_v16i32(<16 x i32>) #0 declare hidden void @external_void_func_v32i32(<32 x i32>) #0 declare hidden void @external_void_func_v32i32_i32(<32 x i32>, i32) #0 
+declare hidden void @external_void_func_v32i32_p3_p5(<32 x i32>, i8 addrspace(3)*, i8 addrspace(5)*) #0 declare hidden void @external_void_func_v32i32_i8_i8_i16(<32 x i32>, i8, i8, i16) #0 ; Structs @@ -3647,6 +3648,110 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i8_i8_i16() #0 { ret void } +define amdgpu_kernel void @test_call_external_void_func_v32i32_p3_p5() #0 { + ; CHECK-LABEL: name: test_call_external_void_func_v32i32_p3_p5 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr16 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr15 + ; CHECK: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 + ; CHECK: [[DEF:%[0-9]+]]:_(p4) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[COPY10:%[0-9]+]]:_(p1) = COPY [[DEF1]](p1) + ; CHECK: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load 8 from `<32 x i32> addrspace(1)* addrspace(4)* undef`, addrspace 4) + ; CHECK: [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: (load 128 from %ir.ptr0, addrspace 1) + ; CHECK: [[LOAD2:%[0-9]+]]:_(p3) = G_LOAD [[DEF1]](p1) :: (load 4 from `i8 addrspace(3)* addrspace(1)* undef`, addrspace 1) + ; CHECK: [[LOAD3:%[0-9]+]]:_(p5) = G_LOAD [[COPY10]](p1) :: (load 4 from `i8 addrspace(5)* addrspace(1)* undef`, addrspace 1) + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[LOAD1]](<32 x s32>) + ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_p3_p5 + ; CHECK: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] + ; CHECK: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY9]](p4) + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY13]], [[C]](s64) + ; CHECK: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY6]] + ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY5]] + ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY4]] + ; CHECK: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 + ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C1]](s32) + ; CHECK: 
[[OR:%[0-9]+]]:_(s32) = G_OR [[COPY18]], [[SHL]] + ; CHECK: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 + ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY20]], [[C2]](s32) + ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; CHECK: $vgpr2 = COPY [[UV2]](s32) + ; CHECK: $vgpr3 = COPY [[UV3]](s32) + ; CHECK: $vgpr4 = COPY [[UV4]](s32) + ; CHECK: $vgpr5 = COPY [[UV5]](s32) + ; CHECK: $vgpr6 = COPY [[UV6]](s32) + ; CHECK: $vgpr7 = COPY [[UV7]](s32) + ; CHECK: $vgpr8 = COPY [[UV8]](s32) + ; CHECK: $vgpr9 = COPY [[UV9]](s32) + ; CHECK: $vgpr10 = COPY [[UV10]](s32) + ; CHECK: $vgpr11 = COPY [[UV11]](s32) + ; CHECK: $vgpr12 = COPY [[UV12]](s32) + ; CHECK: $vgpr13 = COPY [[UV13]](s32) + ; CHECK: $vgpr14 = COPY [[UV14]](s32) + ; CHECK: $vgpr15 = COPY [[UV15]](s32) + ; CHECK: $vgpr16 = COPY [[UV16]](s32) + ; CHECK: $vgpr17 = COPY [[UV17]](s32) + ; CHECK: $vgpr18 = COPY [[UV18]](s32) + ; CHECK: $vgpr19 = COPY [[UV19]](s32) + ; CHECK: $vgpr20 = COPY [[UV20]](s32) + ; CHECK: $vgpr21 = COPY [[UV21]](s32) + ; CHECK: $vgpr22 = COPY [[UV22]](s32) + ; CHECK: $vgpr23 = COPY [[UV23]](s32) + ; CHECK: $vgpr24 = COPY [[UV24]](s32) + ; CHECK: $vgpr25 = COPY [[UV25]](s32) + ; CHECK: $vgpr26 = COPY [[UV26]](s32) + ; CHECK: $vgpr27 = COPY [[UV27]](s32) + ; CHECK: $vgpr28 = COPY [[UV28]](s32) + ; CHECK: $vgpr29 = COPY [[UV29]](s32) + ; CHECK: $vgpr30 = COPY [[UV30]](s32) + ; CHECK: [[COPY21:%[0-9]+]]:_(p5) = COPY $sp_reg + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C3]](s32) + ; CHECK: G_STORE [[UV31]](s32), [[PTR_ADD1]](p5) :: (store 4 into stack, align 16, addrspace 5) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C4]](s32) + ; CHECK: G_STORE [[LOAD2]](p3), [[PTR_ADD2]](p5) :: (store 4 into stack + 4, addrspace 5) + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY21]], [[C5]](s32) + ; CHECK: G_STORE [[LOAD3]](p5), [[PTR_ADD3]](p5) :: (store 4 into stack + 8, align 8, addrspace 5) + ; CHECK: [[COPY22:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY22]](<4 x s32>) + ; CHECK: $sgpr4_sgpr5 = COPY [[COPY11]](p4) + ; CHECK: $sgpr6_sgpr7 = COPY [[COPY12]](p4) + ; CHECK: $sgpr8_sgpr9 = COPY [[PTR_ADD]](p4) + ; CHECK: $sgpr10_sgpr11 = COPY [[COPY14]](s64) + ; CHECK: $sgpr12 = COPY [[COPY15]](s32) + ; CHECK: $sgpr13 = COPY [[COPY16]](s32) + ; CHECK: $sgpr14 = COPY [[COPY17]](s32) + ; CHECK: $vgpr31 = COPY [[OR1]](s32) + ; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @external_void_func_v32i32_p3_p5, csr_amdgpu_highregs, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit 
$vgpr31 + ; CHECK: ADJCALLSTACKDOWN 0, 12, implicit-def $scc + ; CHECK: S_ENDPGM 0 + %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef + %val0 = load <32 x i32>, <32 x i32> addrspace(1)* %ptr0 + %val1 = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* undef + %val2 = load i8 addrspace(5)*, i8 addrspace(5)* addrspace(1)* undef + call void @external_void_func_v32i32_p3_p5(<32 x i32> %val0, i8 addrspace(3)* %val1, i8 addrspace(5)* %val2) + ret void +} + define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; CHECK-LABEL: name: test_call_external_void_func_struct_i8_i32 ; CHECK: bb.1 (%ir-block.0): diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll index 340392ea8d464d..28f60ca7528dbe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll @@ -1795,6 +1795,62 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1 ret void } +define void @void_func_v32i32_p3_p5_i16(<32 x i32> %arg0, i8 addrspace(3)* %arg1, i8 addrspace(5)* %arg2) #0 { + ; CHECK-LABEL: name: void_func_v32i32_p3_p5_i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6 + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7 + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY $vgpr8 + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY $vgpr9 + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY $vgpr10 + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY $vgpr11 + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY $vgpr12 + ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY $vgpr13 + ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY $vgpr14 + ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY $vgpr15 + ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY $vgpr16 + ; CHECK: [[COPY17:%[0-9]+]]:_(s32) = COPY $vgpr17 + ; CHECK: [[COPY18:%[0-9]+]]:_(s32) = COPY $vgpr18 + ; CHECK: [[COPY19:%[0-9]+]]:_(s32) = COPY $vgpr19 + ; CHECK: [[COPY20:%[0-9]+]]:_(s32) = COPY $vgpr20 + ; CHECK: [[COPY21:%[0-9]+]]:_(s32) = COPY $vgpr21 + ; CHECK: [[COPY22:%[0-9]+]]:_(s32) = COPY $vgpr22 + ; CHECK: [[COPY23:%[0-9]+]]:_(s32) = COPY $vgpr23 + ; CHECK: [[COPY24:%[0-9]+]]:_(s32) = COPY $vgpr24 + ; CHECK: [[COPY25:%[0-9]+]]:_(s32) = COPY $vgpr25 + ; CHECK: [[COPY26:%[0-9]+]]:_(s32) = COPY $vgpr26 + ; CHECK: [[COPY27:%[0-9]+]]:_(s32) = COPY $vgpr27 + ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY $vgpr28 + ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY $vgpr29 + ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY $vgpr30 + ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY $vgpr31 + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[FRAME_INDEX]](p5) :: (invariant load 4 from %fixed-stack.1, align 16, addrspace 5) + ; CHECK: [[FRAME_INDEX1:%[0-9]+]]:_(p5) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK: 
[[LOAD1:%[0-9]+]]:_(p5) = G_LOAD [[FRAME_INDEX1]](p5) :: (invariant load 4 from %fixed-stack.0, addrspace 5) + ; CHECK: [[COPY32:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<32 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32), [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32) + ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF + ; CHECK: [[COPY33:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) + ; CHECK: [[COPY34:%[0-9]+]]:_(p1) = COPY [[DEF]](p1) + ; CHECK: G_STORE [[BUILD_VECTOR]](<32 x s32>), [[DEF]](p1) :: (volatile store 128 into `<32 x i32> addrspace(1)* undef`, addrspace 1) + ; CHECK: G_STORE [[LOAD]](p3), [[COPY33]](p1) :: (volatile store 4 into `i8 addrspace(3)* addrspace(1)* undef`, addrspace 1) + ; CHECK: G_STORE [[LOAD1]](p5), [[COPY34]](p1) :: (volatile store 4 into `i8 addrspace(5)* addrspace(1)* undef`, addrspace 1) + ; CHECK: [[COPY35:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY32]] + ; CHECK: S_SETPC_B64_return [[COPY35]] + store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef + store volatile i8 addrspace(3)* %arg1, i8 addrspace(3)* addrspace(1)* undef + store volatile i8 addrspace(5)* %arg2, i8 addrspace(5)* addrspace(1)* undef + ret void +} + define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 { ; CHECK-LABEL: name: void_func_v32i32_v2i32_v2f32 ; CHECK: bb.1 (%ir-block.0): diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs.ll new file mode 100644 index 00000000000000..0f84f12a12574d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s + +declare i32 @llvm.umin.i32(i32, i32) +declare i32 @llvm.umax.i32(i32, i32) +declare i32 @llvm.smin.i32(i32, i32) +declare i32 @llvm.smax.i32(i32, i32) + +declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>) + +define i32 @test_umin_i32(i32 %a, i32 %b) { +; CHECK-LABEL: test_umin_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_min_u32_e32 v0, v0, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %r = call i32 @llvm.umin.i32(i32 %a, i32 %b) + ret i32 %r +} + +define i32 @test_umax_i32(i32 %a, i32 %b) { +; CHECK-LABEL: test_umax_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_max_u32_e32 v0, v0, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %r = call i32 @llvm.umax.i32(i32 %a, i32 %b) + ret i32 %r +} + +define i32 @test_smin_i32(i32 %a, i32 %b) { +; CHECK-LABEL: test_smin_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_min_i32_e32 v0, v0, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %r = call i32 @llvm.smin.i32(i32 %a, i32 %b) + ret i32 %r +} + +define i32 @test_smax_i32(i32 %a, i32 
%b) { +; CHECK-LABEL: test_smax_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_max_i32_e32 v0, v0, v1 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %r = call i32 @llvm.smax.i32(i32 %a, i32 %b) + ret i32 %r +} + +define <4 x i32> @test_umin_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_umin_v4i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_min_u32_e32 v0, v0, v4 +; CHECK-NEXT: v_min_u32_e32 v1, v1, v5 +; CHECK-NEXT: v_min_u32_e32 v2, v2, v6 +; CHECK-NEXT: v_min_u32_e32 v3, v3, v7 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %r = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %r +} + +define <4 x i32> @test_umax_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_umax_v4i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_max_u32_e32 v0, v0, v4 +; CHECK-NEXT: v_max_u32_e32 v1, v1, v5 +; CHECK-NEXT: v_max_u32_e32 v2, v2, v6 +; CHECK-NEXT: v_max_u32_e32 v3, v3, v7 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %r = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %r +} + +define <4 x i32> @test_smin_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_smin_v4i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_min_i32_e32 v0, v0, v4 +; CHECK-NEXT: v_min_i32_e32 v1, v1, v5 +; CHECK-NEXT: v_min_i32_e32 v2, v2, v6 +; CHECK-NEXT: v_min_i32_e32 v3, v3, v7 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %r = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %r +} + +define <4 x i32> @test_smax_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_smax_v4i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_max_i32_e32 v0, v0, v4 +; CHECK-NEXT: v_max_i32_e32 v1, v1, v5 +; CHECK-NEXT: v_max_i32_e32 v2, v2, v6 +; CHECK-NEXT: v_max_i32_e32 v3, v3, v7 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %r = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %r +} diff --git a/llvm/test/CodeGen/AMDGPU/huge-number-operand-folds.mir b/llvm/test/CodeGen/AMDGPU/huge-number-operand-folds.mir new file mode 100644 index 00000000000000..1a8feddeea82bf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/huge-number-operand-folds.mir @@ -0,0 +1,22 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-fold-operands %s -o - | FileCheck -check-prefix=GCN %s + +# We were storing fold candidate uses in an unsigned char, which this exceeds. +# The use operand overflows and the expected register operand hits the immediate 0. +# We never have more than a handful of non-implicit operands, so don't try to fold into +# implicit operands to avoid this problem. 
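To make the failure mode described in the comment above concrete, here is a minimal standalone C++ sketch. It is not the actual SIFoldOperands code; the struct and field names are hypothetical. It only illustrates how an operand index stored in an unsigned char wraps once an instruction carries more than 255 operands, as the S_ENDPGM in the test below does with its long run of implicit uses.

// Minimal sketch, assuming a hypothetical FoldCandidate-like record that
// narrows the use-operand index to 8 bits (the pitfall described above).
#include <cstdio>

struct FoldCandidate {
  unsigned char UseOpNo; // 8-bit index: wraps for operand numbers > 255
};

int main() {
  const unsigned RealUseOpNo = 300; // e.g. one of the many implicit uses below
  FoldCandidate C{static_cast<unsigned char>(RealUseOpNo)};
  // 300 wraps modulo 256 to 44, so a later fold would patch the wrong operand.
  std::printf("stored index %u, actual index %u\n",
              static_cast<unsigned>(C.UseOpNo), RealUseOpNo);
  return 0;
}

This matches the fix the comment describes: since real code never needs that many explicit operands, declining to fold into implicit operands sidesteps the narrow index entirely.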
+ +--- +name: op_idx_overflows_uchar +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr12_sgpr13_sgpr14_sgpr15 + + ; GCN-LABEL: name: op_idx_overflows_uchar + ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]] + %0:sreg_32 = S_MOV_B32 0 + %1:vgpr_32 = COPY %0 + S_ENDPGM 0, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit 
%1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1, implicit %1 +... diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll new file mode 100644 index 00000000000000..345830676abaaa --- /dev/null +++ b/llvm/test/CodeGen/X86/abs.ll @@ -0,0 +1,618 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefixes=X64,SSE +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx | FileCheck %s --check-prefixes=X64,AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx2 | FileCheck %s --check-prefixes=X64,AVX,AVX2 +; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86 + +; The i1 parameter is not codegen-relevant right now. + +declare i8 @llvm.abs.i8(i8, i1) +declare i16 @llvm.abs.i16(i16, i1) +declare i24 @llvm.abs.i24(i24, i1) +declare i32 @llvm.abs.i32(i32, i1) +declare i64 @llvm.abs.i64(i64, i1) +declare i128 @llvm.abs.i128(i128, i1) + +declare <1 x i32> @llvm.abs.v1i32(<1 x i32>, i1) +declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1) +declare <3 x i32> @llvm.abs.v3i32(<3 x i32>, i1) +declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1) +declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) + +declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1) +declare <16 x i8> @llvm.abs.v16i8(<16 x i8>, i1) + +define i8 @test_i8(i8 %a) nounwind { +; X64-LABEL: test_i8: +; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi +; X64-NEXT: movl %edi, %ecx +; X64-NEXT: sarb $7, %cl +; X64-NEXT: leal (%rdi,%rcx), %eax +; X64-NEXT: xorb %cl, %al +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_i8: +; X86: # %bb.0: +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: sarb $7, %cl +; X86-NEXT: addb %cl, %al +; X86-NEXT: xorb %cl, %al +; X86-NEXT: retl + %r = call i8 @llvm.abs.i8(i8 %a, i1 false) + ret i8 %r +} + +define i16 @test_i16(i16 %a) nounwind { +; X64-LABEL: test_i16: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: negw %ax +; X64-NEXT: cmovlw %di, %ax +; X64-NEXT: retq +; +; X86-LABEL: test_i16: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negw %ax +; X86-NEXT: cmovlw %cx, %ax +; X86-NEXT: retl + %r = call i16 @llvm.abs.i16(i16 %a, i1 false) + ret i16 %r +} + +define i24 @test_i24(i24 %a) nounwind { +; X64-LABEL: test_i24: +; X64: # %bb.0: +; X64-NEXT: shll $8, %edi +; X64-NEXT: sarl $8, %edi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmovll %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_i24: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $8, %ecx +; X86-NEXT: sarl $8, %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: retl + %r = call i24 @llvm.abs.i24(i24 %a, i1 false) + ret i24 %r +} + +define i32 @test_i32(i32 %a) nounwind { +; X64-LABEL: test_i32: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmovll %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: retl + %r = call i32 @llvm.abs.i32(i32 %a, i1 false) + ret i32 %r +} + +define i64 @test_i64(i64 %a) nounwind { +; X64-LABEL: test_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax 
+; X64-NEXT: negq %rax +; X64-NEXT: cmovlq %rdi, %rax +; X64-NEXT: retq +; +; X86-LABEL: test_i64: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sarl $31, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: adcl %ecx, %edx +; X86-NEXT: xorl %ecx, %edx +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: retl + %r = call i64 @llvm.abs.i64(i64 %a, i1 false) + ret i64 %r +} + +define i128 @test_i128(i128 %a) nounwind { +; X64-LABEL: test_i128: +; X64: # %bb.0: +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: negq %rax +; X64-NEXT: sbbq %rsi, %rdx +; X64-NEXT: testq %rsi, %rsi +; X64-NEXT: cmovnsq %rdi, %rax +; X64-NEXT: cmovnsq %rsi, %rdx +; X64-NEXT: retq +; +; X86-LABEL: test_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: negl %edi +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: movl $0, %ebp +; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: testl %eax, %eax +; X86-NEXT: cmovnsl %eax, %esi +; X86-NEXT: cmovnsl %ecx, %ebp +; X86-NEXT: cmovnsl %edx, %ebx +; X86-NEXT: cmovnsl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: movl %ebp, 8(%eax) +; X86-NEXT: movl %esi, 12(%eax) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call i128 @llvm.abs.i128(i128 %a, i1 false) + ret i128 %r +} + +define <1 x i32> @test_v1i32(<1 x i32> %a) nounwind { +; X64-LABEL: test_v1i32: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: negl %eax +; X64-NEXT: cmovll %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_v1i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: retl + %r = call <1 x i32> @llvm.abs.v1i32(<1 x i32> %a, i1 false) + ret <1 x i32> %r +} + +define <2 x i32> @test_v2i32(<2 x i32> %a) nounwind { +; SSE-LABEL: test_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpabsd %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v2i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: cmovll %edx, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: negl %edx +; X86-NEXT: cmovll %ecx, %edx +; X86-NEXT: retl + %r = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %a, i1 false) + ret <2 x i32> %r +} + +define <3 x i32> @test_v3i32(<3 x i32> %a) nounwind { +; SSE-LABEL: test_v3i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v3i32: +; AVX: # %bb.0: +; AVX-NEXT: vpabsd %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v3i32: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax 
+; X86-NEXT: negl %eax +; X86-NEXT: cmovll %edx, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: negl %edx +; X86-NEXT: cmovll %ecx, %edx +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: negl %ecx +; X86-NEXT: cmovll %esi, %ecx +; X86-NEXT: popl %esi +; X86-NEXT: retl + %r = call <3 x i32> @llvm.abs.v3i32(<3 x i32> %a, i1 false) + ret <3 x i32> %r +} + +define <4 x i32> @test_v4i32(<4 x i32> %a) nounwind { +; SSE-LABEL: test_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psrad $31, %xmm1 +; SSE-NEXT: paddd %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpabsd %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v4i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: negl %ebx +; X86-NEXT: cmovll %edi, %ebx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: negl %edi +; X86-NEXT: cmovll %esi, %edi +; X86-NEXT: movl %edx, %esi +; X86-NEXT: negl %esi +; X86-NEXT: cmovll %edx, %esi +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: negl %edx +; X86-NEXT: cmovll %ecx, %edx +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl $4 + %r = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a, i1 false) + ret <4 x i32> %r +} + +define <8 x i32> @test_v8i32(<8 x i32> %a) nounwind { +; SSE-LABEL: test_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: paddd %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpabsd %xmm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpabsd %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpabsd %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; X86-LABEL: test_v8i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: negl %ecx +; X86-NEXT: cmovll %edx, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: negl %ecx +; X86-NEXT: cmovll %ebp, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: negl %ebp +; X86-NEXT: cmovll %ebx, %ebp +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: negl %ebx +; X86-NEXT: cmovll %edi, %ebx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: negl %edi +; X86-NEXT: cmovll %esi, %edi +; X86-NEXT: movl %eax, %esi +; X86-NEXT: negl %esi +; X86-NEXT: cmovll %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negl %eax +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, 
%ecx +; X86-NEXT: negl %ecx +; X86-NEXT: cmovll %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ecx, 28(%edx) +; X86-NEXT: movl %eax, 24(%edx) +; X86-NEXT: movl %esi, 20(%edx) +; X86-NEXT: movl %edi, 16(%edx) +; X86-NEXT: movl %ebx, 12(%edx) +; X86-NEXT: movl %ebp, 8(%edx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 4(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, (%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a, i1 false) + ret <8 x i32> %r +} + +define <8 x i16> @test_v8i16(<8 x i16> %a) nounwind { +; SSE-LABEL: test_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: psraw $15, %xmm1 +; SSE-NEXT: paddw %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpabsw %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v8i16: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: negw %cx +; X86-NEXT: cmovlw %dx, %cx +; X86-NEXT: movw %cx, {{[-0-9]+}}(%e{{[sb]}}p) # 2-byte Spill +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: negw %cx +; X86-NEXT: cmovlw %bp, %cx +; X86-NEXT: movw %cx, (%esp) # 2-byte Spill +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: negw %bp +; X86-NEXT: cmovlw %bx, %bp +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: negw %bx +; X86-NEXT: cmovlw %di, %bx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: negw %di +; X86-NEXT: cmovlw %si, %di +; X86-NEXT: movl %eax, %esi +; X86-NEXT: negw %si +; X86-NEXT: cmovlw %ax, %si +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: negw %ax +; X86-NEXT: cmovlw %cx, %ax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: negw %cx +; X86-NEXT: cmovlw %dx, %cx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movw %cx, 14(%edx) +; X86-NEXT: movw %ax, 12(%edx) +; X86-NEXT: movw %si, 10(%edx) +; X86-NEXT: movw %di, 8(%edx) +; X86-NEXT: movw %bx, 6(%edx) +; X86-NEXT: movw %bp, 4(%edx) +; X86-NEXT: movzwl (%esp), %eax # 2-byte Folded Reload +; X86-NEXT: movw %ax, 2(%edx) +; X86-NEXT: movzwl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 2-byte Folded Reload +; X86-NEXT: movw %ax, (%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: addl $4, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a, i1 false) + ret <8 x i16> %r +} + +define <16 x i8> @test_v16i8(<16 x i8> %a) nounwind { +; SSE-LABEL: test_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpabsb %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v16i8: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %esi +; X86-NEXT: subl $12, %esp +; X86-NEXT: movb {{[0-9]+}}(%esp), %bh +; X86-NEXT: movb {{[0-9]+}}(%esp), %bl +; 
X86-NEXT: movb {{[0-9]+}}(%esp), %dh +; X86-NEXT: movb {{[0-9]+}}(%esp), %ch +; X86-NEXT: movb {{[0-9]+}}(%esp), %ah +; X86-NEXT: movb {{[0-9]+}}(%esp), %dl +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movb %cl, %al +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %cl +; X86-NEXT: xorb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb %dl, %al +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %dl +; X86-NEXT: xorb %al, %dl +; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb %ah, %al +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %ah +; X86-NEXT: xorb %al, %ah +; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb %ch, %al +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %ch +; X86-NEXT: xorb %al, %ch +; X86-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb %dh, %al +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %dh +; X86-NEXT: xorb %al, %dh +; X86-NEXT: movb %dh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %bl +; X86-NEXT: xorb %al, %bl +; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb %bh, %al +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %bh +; X86-NEXT: xorb %al, %bh +; X86-NEXT: movb %bh, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %cl +; X86-NEXT: xorb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %cl +; X86-NEXT: xorb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movb {{[0-9]+}}(%esp), %bh +; X86-NEXT: movb %bh, %al +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %bh +; X86-NEXT: xorb %al, %bh +; X86-NEXT: movb {{[0-9]+}}(%esp), %bl +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %bl +; X86-NEXT: xorb %al, %bl +; X86-NEXT: movb {{[0-9]+}}(%esp), %dh +; X86-NEXT: movb %dh, %al +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %dh +; X86-NEXT: xorb %al, %dh +; X86-NEXT: movb {{[0-9]+}}(%esp), %ch +; X86-NEXT: movb %ch, %al +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %ch +; X86-NEXT: xorb %al, %ch +; X86-NEXT: movb {{[0-9]+}}(%esp), %dl +; X86-NEXT: movl %edx, %eax +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %dl +; X86-NEXT: xorb %al, %dl +; X86-NEXT: movb {{[0-9]+}}(%esp), %cl +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarb $7, %al +; X86-NEXT: addb %al, %cl +; X86-NEXT: xorb %al, %cl +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: movb %al, %ah +; X86-NEXT: sarb $7, %ah +; X86-NEXT: addb %ah, %al +; X86-NEXT: xorb %ah, %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movb %al, 15(%esi) +; X86-NEXT: movb %cl, 14(%esi) +; X86-NEXT: movb %dl, 13(%esi) +; X86-NEXT: movb %ch, 12(%esi) +; X86-NEXT: movb %dh, 11(%esi) +; X86-NEXT: movb %bl, 10(%esi) +; X86-NEXT: movb %bh, 9(%esi) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; X86-NEXT: movb %al, 8(%esi) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; X86-NEXT: movb %al, 7(%esi) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; X86-NEXT: movb %al, 6(%esi) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; X86-NEXT: movb %al, 5(%esi) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte 
Reload +; X86-NEXT: movb %al, 4(%esi) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; X86-NEXT: movb %al, 3(%esi) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; X86-NEXT: movb %al, 2(%esi) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; X86-NEXT: movb %al, 1(%esi) +; X86-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Reload +; X86-NEXT: movb %al, (%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl $12, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %ebx +; X86-NEXT: retl $4 + %r = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a, i1 false) + ret <16 x i8> %r +} diff --git a/llvm/test/CodeGen/X86/smax.ll b/llvm/test/CodeGen/X86/smax.ll new file mode 100644 index 00000000000000..31d0822f8090a2 --- /dev/null +++ b/llvm/test/CodeGen/X86/smax.ll @@ -0,0 +1,662 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefixes=X64,SSE +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx | FileCheck %s --check-prefixes=X64,AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx2 | FileCheck %s --check-prefixes=X64,AVX,AVX2 +; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86 + +declare i8 @llvm.smax.i8(i8, i8) +declare i16 @llvm.smax.i16(i16, i16) +declare i24 @llvm.smax.i24(i24, i24) +declare i32 @llvm.smax.i32(i32, i32) +declare i64 @llvm.smax.i64(i64, i64) +declare i128 @llvm.smax.i128(i128, i128) + +declare <1 x i32> @llvm.smax.v1i32(<1 x i32>, <1 x i32>) +declare <2 x i32> @llvm.smax.v2i32(<2 x i32>, <2 x i32>) +declare <3 x i32> @llvm.smax.v3i32(<3 x i32>, <3 x i32>) +declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>) + +declare <8 x i16> @llvm.smax.v8i16(<8 x i16>, <8 x i16>) +declare <16 x i8> @llvm.smax.v16i8(<16 x i8>, <16 x i8>) + +define i8 @test_i8(i8 %a, i8 %b) nounwind { +; X64-LABEL: test_i8: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpb %al, %dil +; X64-NEXT: cmovgl %edi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_i8: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpb %al, %cl +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %r = call i8 @llvm.smax.i8(i8 %a, i8 %b) + ret i8 %r +} + +define i16 @test_i16(i16 %a, i16 %b) nounwind { +; X64-LABEL: test_i16: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpw %ax, %di +; X64-NEXT: cmovgl %edi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl + %r = call i16 @llvm.smax.i16(i16 %a, i16 %b) + ret i16 %r +} + +define i24 @test_i24(i24 %a, i24 %b) nounwind { +; X64-LABEL: test_i24: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll $8, %esi +; X64-NEXT: sarl $8, %esi +; X64-NEXT: shll $8, %eax +; X64-NEXT: sarl $8, %eax +; X64-NEXT: cmpl %esi, %eax +; X64-NEXT: cmovlel %esi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_i24: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $8, %ecx +; X86-NEXT: sarl $8, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $8, %eax +; X86-NEXT: sarl $8, 
%eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovlel %ecx, %eax +; X86-NEXT: retl + %r = call i24 @llvm.smax.i24(i24 %a, i24 %b) + ret i24 %r +} + +define i32 @test_i32(i32 %a, i32 %b) nounwind { +; X64-LABEL: test_i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmovgl %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: retl + %r = call i32 @llvm.smax.i32(i32 %a, i32 %b) + ret i32 %r +} + +define i64 @test_i64(i64 %a, i64 %b) nounwind { +; X64-LABEL: test_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: cmovgq %rdi, %rax +; X64-NEXT: retq +; +; X86-LABEL: test_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: cmoval %ecx, %edi +; X86-NEXT: cmpl %edx, %esi +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: cmovel %edi, %eax +; X86-NEXT: cmovgl %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl + %r = call i64 @llvm.smax.i64(i64 %a, i64 %b) + ret i64 %r +} + +define i128 @test_i128(i128 %a, i128 %b) nounwind { +; X64-LABEL: test_i128: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: cmovaq %rdi, %rdx +; X64-NEXT: cmpq %rcx, %rsi +; X64-NEXT: cmovgq %rdi, %rax +; X64-NEXT: cmoveq %rdx, %rax +; X64-NEXT: cmovgq %rsi, %rcx +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: retq +; +; X86-LABEL: test_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %ebx, %edx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: cmoval %edx, %eax +; X86-NEXT: cmpl %esi, %ecx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: cmoval %edx, %ebp +; X86-NEXT: cmovel %eax, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: cmovel %ebp, %ebx +; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %eax, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: cmoval %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmpl %edx, %ebp +; X86-NEXT: cmovgl %edi, %eax +; X86-NEXT: cmovel %ecx, %eax +; X86-NEXT: cmovgl %ebp, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl %eax, 8(%ecx) +; X86-NEXT: movl %esi, 4(%ecx) +; X86-NEXT: movl %ebx, (%ecx) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $4, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; 
X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call i128 @llvm.smax.i128(i128 %a, i128 %b) + ret i128 %r +} + +define <1 x i32> @test_v1i32(<1 x i32> %a, <1 x i32> %b) nounwind { +; X64-LABEL: test_v1i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmovgl %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_v1i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: retl + %r = call <1 x i32> @llvm.smax.v1i32(<1 x i32> %a, <1 x i32> %b) + ret <1 x i32> %r +} + +define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { +; SSE-LABEL: test_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v2i32: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %esi +; X86-NEXT: cmovgl %esi, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmovgl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: retl + %r = call <2 x i32> @llvm.smax.v2i32(<2 x i32> %a, <2 x i32> %b) + ret <2 x i32> %r +} + +define <3 x i32> @test_v3i32(<3 x i32> %a, <3 x i32> %b) nounwind { +; SSE-LABEL: test_v3i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v3i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v3i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmpl %eax, %ebx +; X86-NEXT: cmovgl %ebx, %eax +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: cmovgl %edi, %edx +; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: cmovgl %esi, %ecx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl + %r = call <3 x i32> @llvm.smax.v3i32(<3 x i32> %a, <3 x i32> %b) + ret <3 x i32> %r +} + +define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { +; SSE-LABEL: test_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v4i32: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmovgl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %esi, %eax +; 
X86-NEXT: cmovgl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovgl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl $4 + %r = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %r +} + +define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { +; SSE-LABEL: test_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; X86-LABEL: test_v8i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebp, %eax +; X86-NEXT: cmovgl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebx, %eax +; X86-NEXT: cmovgl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmovgl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmovgl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovgl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %eax, %edx +; X86-NEXT: cmovgl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %eax, 28(%edx) +; X86-NEXT: movl %ecx, 24(%edx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 16(%edx) +; X86-NEXT: movl %esi, 12(%edx) +; X86-NEXT: movl %edi, 8(%edx) +; X86-NEXT: movl %ebx, 4(%edx) +; X86-NEXT: movl %ebp, (%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %r 
+} + +define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { +; SSE-LABEL: test_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pmaxsw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v8i16: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bp, %ax +; X86-NEXT: cmovgl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bx, %ax +; X86-NEXT: cmovgl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %di, %ax +; X86-NEXT: cmovgl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %si, %ax +; X86-NEXT: cmovgl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmovgl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmovgl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: cmovgl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movw %ax, 14(%ecx) +; X86-NEXT: movw %dx, 12(%ecx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 10(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 8(%ecx) +; X86-NEXT: movw %si, 6(%ecx) +; X86-NEXT: movw %di, 4(%ecx) +; X86-NEXT: movw %bx, 2(%ecx) +; X86-NEXT: movw %bp, (%ecx) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %r +} + +define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { +; SSE-LABEL: test_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pcmpgtb %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v16i8: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $40, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %bl, %al +; X86-NEXT: cmovgl %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %dl, %al +; X86-NEXT: cmovgl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %bl, %al +; X86-NEXT: cmovgl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %dl, %al +; X86-NEXT: cmovgl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovgl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb %cl, 15(%eax) +; X86-NEXT: movb %dl, 14(%eax) +; X86-NEXT: movb %bl, 13(%eax) +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movb %cl, 12(%eax) +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movb %cl, 11(%eax) +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movb %cl, 10(%eax) +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 9(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 7(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 6(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 5(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 4(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 3(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 2(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 1(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 
4-byte Reload +; X86-NEXT: movb %cl, (%eax) +; X86-NEXT: addl $40, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %r +} diff --git a/llvm/test/CodeGen/X86/smin.ll b/llvm/test/CodeGen/X86/smin.ll new file mode 100644 index 00000000000000..70391534f544c1 --- /dev/null +++ b/llvm/test/CodeGen/X86/smin.ll @@ -0,0 +1,656 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefixes=X64,SSE +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx | FileCheck %s --check-prefixes=X64,AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx2 | FileCheck %s --check-prefixes=X64,AVX,AVX2 +; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86 + +declare i8 @llvm.smin.i8(i8, i8) +declare i16 @llvm.smin.i16(i16, i16) +declare i24 @llvm.smin.i24(i24, i24) +declare i32 @llvm.smin.i32(i32, i32) +declare i64 @llvm.smin.i64(i64, i64) +declare i128 @llvm.smin.i128(i128, i128) + +declare <1 x i32> @llvm.smin.v1i32(<1 x i32>, <1 x i32>) +declare <2 x i32> @llvm.smin.v2i32(<2 x i32>, <2 x i32>) +declare <3 x i32> @llvm.smin.v3i32(<3 x i32>, <3 x i32>) +declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>) + +declare <8 x i16> @llvm.smin.v8i16(<8 x i16>, <8 x i16>) +declare <16 x i8> @llvm.smin.v16i8(<16 x i8>, <16 x i8>) + +define i8 @test_i8(i8 %a, i8 %b) nounwind { +; X64-LABEL: test_i8: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpb %al, %dil +; X64-NEXT: cmovll %edi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_i8: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpb %al, %cl +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %r = call i8 @llvm.smin.i8(i8 %a, i8 %b) + ret i8 %r +} + +define i16 @test_i16(i16 %a, i16 %b) nounwind { +; X64-LABEL: test_i16: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpw %ax, %di +; X64-NEXT: cmovll %edi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl + %r = call i16 @llvm.smin.i16(i16 %a, i16 %b) + ret i16 %r +} + +define i24 @test_i24(i24 %a, i24 %b) nounwind { +; X64-LABEL: test_i24: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: shll $8, %esi +; X64-NEXT: sarl $8, %esi +; X64-NEXT: shll $8, %eax +; X64-NEXT: sarl $8, %eax +; X64-NEXT: cmpl %esi, %eax +; X64-NEXT: cmovgel %esi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_i24: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: shll $8, %ecx +; X86-NEXT: sarl $8, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $8, %eax +; X86-NEXT: sarl $8, %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovgel %ecx, %eax +; X86-NEXT: retl + %r = call i24 @llvm.smin.i24(i24 %a, i24 %b) + ret i24 %r +} + +define i32 @test_i32(i32 %a, i32 %b) nounwind { +; X64-LABEL: test_i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmovll %edi, %eax +; X64-NEXT: retq +; +; 
X86-LABEL: test_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: retl + %r = call i32 @llvm.smin.i32(i32 %a, i32 %b) + ret i32 %r +} + +define i64 @test_i64(i64 %a, i64 %b) nounwind { +; X64-LABEL: test_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: cmovlq %rdi, %rax +; X64-NEXT: retq +; +; X86-LABEL: test_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: cmovbl %ecx, %edi +; X86-NEXT: cmpl %edx, %esi +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: cmovel %edi, %eax +; X86-NEXT: cmovll %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl + %r = call i64 @llvm.smin.i64(i64 %a, i64 %b) + ret i64 %r +} + +define i128 @test_i128(i128 %a, i128 %b) nounwind { +; X64-LABEL: test_i128: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: cmovbq %rdi, %rdx +; X64-NEXT: cmpq %rcx, %rsi +; X64-NEXT: cmovlq %rdi, %rax +; X64-NEXT: cmoveq %rdx, %rax +; X64-NEXT: cmovlq %rsi, %rcx +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: retq +; +; X86-LABEL: test_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: cmpl %esi, %edi +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: cmovbl %eax, %ebp +; X86-NEXT: cmovel %ebx, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: cmovbl %edi, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: movl %edx, %eax +; X86-NEXT: cmovbl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: xorl %edx, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: cmovel %ebp, %ecx +; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %eax, %edi +; X86-NEXT: cmovll {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %eax, 12(%edi) +; X86-NEXT: movl %edx, 8(%edi) +; X86-NEXT: movl %esi, 4(%edi) +; X86-NEXT: movl %ecx, (%edi) +; X86-NEXT: movl %edi, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call i128 @llvm.smin.i128(i128 %a, i128 %b) + ret i128 %r +} + +define <1 x i32> @test_v1i32(<1 x i32> %a, <1 x i32> %b) nounwind { +; X64-LABEL: test_v1i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, 
%edi +; X64-NEXT: cmovll %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_v1i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: retl + %r = call <1 x i32> @llvm.smin.v1i32(<1 x i32> %a, <1 x i32> %b) + ret <1 x i32> %r +} + +define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { +; SSE-LABEL: test_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v2i32: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %esi +; X86-NEXT: cmovll %esi, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmovll %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: retl + %r = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %a, <2 x i32> %b) + ret <2 x i32> %r +} + +define <3 x i32> @test_v3i32(<3 x i32> %a, <3 x i32> %b) nounwind { +; SSE-LABEL: test_v3i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v3i32: +; AVX: # %bb.0: +; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v3i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmpl %eax, %ebx +; X86-NEXT: cmovll %ebx, %eax +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: cmovll %edi, %edx +; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: cmovll %esi, %ecx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl + %r = call <3 x i32> @llvm.smin.v3i32(<3 x i32> %a, <3 x i32> %b) + ret <3 x i32> %r +} + +define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { +; SSE-LABEL: test_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm0, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v4i32: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmovll %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmovll %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; 
X86-NEXT: movl %edi, (%eax) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl $4 + %r = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %r +} + +define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { +; SSE-LABEL: test_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: pcmpgtd %xmm0, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: por %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; X86-LABEL: test_v8i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebp, %eax +; X86-NEXT: cmovll %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebx, %eax +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmovll %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmovll %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %eax, %edx +; X86-NEXT: cmovll %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %eax, 28(%edx) +; X86-NEXT: movl %ecx, 24(%edx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 16(%edx) +; X86-NEXT: movl %esi, 12(%edx) +; X86-NEXT: movl %edi, 8(%edx) +; X86-NEXT: movl %ebx, 4(%edx) +; X86-NEXT: movl %ebp, (%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %r +} + +define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { +; SSE-LABEL: test_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: pminsw %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v8i16: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, 
%esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bp, %ax +; X86-NEXT: cmovll %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bx, %ax +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %di, %ax +; X86-NEXT: cmovll %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %si, %ax +; X86-NEXT: cmovll %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movw %ax, 14(%ecx) +; X86-NEXT: movw %dx, 12(%ecx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 10(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 8(%ecx) +; X86-NEXT: movw %si, 6(%ecx) +; X86-NEXT: movw %di, 4(%ecx) +; X86-NEXT: movw %bx, 2(%ecx) +; X86-NEXT: movw %bp, (%ecx) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %r +} + +define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { +; SSE-LABEL: test_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pcmpgtb %xmm0, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v16i8: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $40, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %bl, %al +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %dl, %al +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %bl, %al +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %dl, %al +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovll %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb %cl, 15(%eax) +; X86-NEXT: movb %dl, 14(%eax) +; X86-NEXT: movb %bl, 13(%eax) +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movb %cl, 12(%eax) +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movb %cl, 11(%eax) +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movb %cl, 10(%eax) +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 9(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 7(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 6(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 5(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 4(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 3(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 2(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 1(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, (%eax) +; X86-NEXT: addl $40, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %r +} diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll new file mode 100644 index 00000000000000..14a0248e191457 --- /dev/null +++ b/llvm/test/CodeGen/X86/umax.ll @@ -0,0 
+1,668 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefixes=X64,SSE +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx | FileCheck %s --check-prefixes=X64,AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx2 | FileCheck %s --check-prefixes=X64,AVX,AVX2 +; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86 + +declare i8 @llvm.umax.i8(i8, i8) +declare i16 @llvm.umax.i16(i16, i16) +declare i24 @llvm.umax.i24(i24, i24) +declare i32 @llvm.umax.i32(i32, i32) +declare i64 @llvm.umax.i64(i64, i64) +declare i128 @llvm.umax.i128(i128, i128) + +declare <1 x i32> @llvm.umax.v1i32(<1 x i32>, <1 x i32>) +declare <2 x i32> @llvm.umax.v2i32(<2 x i32>, <2 x i32>) +declare <3 x i32> @llvm.umax.v3i32(<3 x i32>, <3 x i32>) +declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>) + +declare <8 x i16> @llvm.umax.v8i16(<8 x i16>, <8 x i16>) +declare <16 x i8> @llvm.umax.v16i8(<16 x i8>, <16 x i8>) + +define i8 @test_i8(i8 %a, i8 %b) nounwind { +; X64-LABEL: test_i8: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpb %al, %dil +; X64-NEXT: cmoval %edi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_i8: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpb %al, %cl +; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %r = call i8 @llvm.umax.i8(i8 %a, i8 %b) + ret i8 %r +} + +define i16 @test_i16(i16 %a, i16 %b) nounwind { +; X64-LABEL: test_i16: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpw %ax, %di +; X64-NEXT: cmoval %edi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl + %r = call i16 @llvm.umax.i16(i16 %a, i16 %b) + ret i16 %r +} + +define i24 @test_i24(i24 %a, i24 %b) nounwind { +; X64-LABEL: test_i24: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: andl $16777215, %esi # imm = 0xFFFFFF +; X64-NEXT: andl $16777215, %eax # imm = 0xFFFFFF +; X64-NEXT: cmpl %esi, %eax +; X64-NEXT: cmovbel %esi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_i24: +; X86: # %bb.0: +; X86-NEXT: movl $16777215, %eax # imm = 0xFFFFFF +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl %eax, %ecx +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovbel %ecx, %eax +; X86-NEXT: retl + %r = call i24 @llvm.umax.i24(i24 %a, i24 %b) + ret i24 %r +} + +define i32 @test_i32(i32 %a, i32 %b) nounwind { +; X64-LABEL: test_i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmoval %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: retl + %r = call i32 @llvm.umax.i32(i32 %a, i32 %b) + ret i32 %r +} + +define i64 @test_i64(i64 %a, i64 %b) nounwind { +; X64-LABEL: test_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: cmovaq %rdi, %rax +; X64-NEXT: retq +; +; X86-LABEL: test_i64: +; X86: # 
%bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: cmoval %ecx, %edi +; X86-NEXT: cmpl %edx, %esi +; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: cmovel %edi, %eax +; X86-NEXT: cmoval %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl + %r = call i64 @llvm.umax.i64(i64 %a, i64 %b) + ret i64 %r +} + +define i128 @test_i128(i128 %a, i128 %b) nounwind { +; X64-LABEL: test_i128: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: cmovaq %rdi, %rdx +; X64-NEXT: cmpq %rcx, %rsi +; X64-NEXT: cmovaq %rdi, %rax +; X64-NEXT: cmoveq %rdx, %rax +; X64-NEXT: cmovaq %rsi, %rcx +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: retq +; +; X86-LABEL: test_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %ebx, %edx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: cmoval %edx, %eax +; X86-NEXT: cmpl %esi, %ecx +; X86-NEXT: movl %ebx, %ebp +; X86-NEXT: cmoval %edx, %ebp +; X86-NEXT: cmovel %eax, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: cmovel %ebp, %ebx +; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %eax, %edi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: cmoval %edi, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmpl %edx, %ebp +; X86-NEXT: cmoval %edi, %eax +; X86-NEXT: cmovel %ecx, %eax +; X86-NEXT: cmoval %ebp, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %edx, 12(%ecx) +; X86-NEXT: movl %eax, 8(%ecx) +; X86-NEXT: movl %esi, 4(%ecx) +; X86-NEXT: movl %ebx, (%ecx) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $4, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call i128 @llvm.umax.i128(i128 %a, i128 %b) + ret i128 %r +} + +define <1 x i32> @test_v1i32(<1 x i32> %a, <1 x i32> %b) nounwind { +; X64-LABEL: test_v1i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmoval %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_v1i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: retl + %r = call <1 x i32> @llvm.umax.v1i32(<1 x i32> %a, <1 x i32> %b) + ret <1 x i32> %r +} + +define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { +; SSE-LABEL: test_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; 
SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: pxor %xmm0, %xmm2 +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v2i32: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %esi +; X86-NEXT: cmoval %esi, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmoval %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: retl + %r = call <2 x i32> @llvm.umax.v2i32(<2 x i32> %a, <2 x i32> %b) + ret <2 x i32> %r +} + +define <3 x i32> @test_v3i32(<3 x i32> %a, <3 x i32> %b) nounwind { +; SSE-LABEL: test_v3i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: pxor %xmm0, %xmm2 +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v3i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v3i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmpl %eax, %ebx +; X86-NEXT: cmoval %ebx, %eax +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: cmoval %edi, %edx +; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: cmoval %esi, %ecx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl + %r = call <3 x i32> @llvm.umax.v3i32(<3 x i32> %a, <3 x i32> %b) + ret <3 x i32> %r +} + +define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { +; SSE-LABEL: test_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: pxor %xmm0, %xmm2 +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v4i32: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmoval %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmoval %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmoval %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl $4 + %r = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %a, <4 x 
i32> %b) + ret <4 x i32> %r +} + +define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { +; SSE-LABEL: test_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pxor %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: pxor %xmm5, %xmm4 +; SSE-NEXT: pcmpgtd %xmm6, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm5, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm5 +; SSE-NEXT: pcmpgtd %xmm0, %xmm5 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; X86-LABEL: test_v8i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebp, %eax +; X86-NEXT: cmoval %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebx, %eax +; X86-NEXT: cmoval %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmoval %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmoval %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmoval %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %eax, %edx +; X86-NEXT: cmoval %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %eax, 28(%edx) +; X86-NEXT: movl %ecx, 24(%edx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 16(%edx) +; X86-NEXT: movl %esi, 12(%edx) +; X86-NEXT: movl %edi, 8(%edx) +; X86-NEXT: movl %ebx, 4(%edx) +; X86-NEXT: movl %ebp, (%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %r +} + +define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { +; SSE-LABEL: test_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pmaxsw %xmm1, %xmm0 +; SSE-NEXT: pxor 
%xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v8i16: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bp, %ax +; X86-NEXT: cmoval %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bx, %ax +; X86-NEXT: cmoval %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %di, %ax +; X86-NEXT: cmoval %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %si, %ax +; X86-NEXT: cmoval %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmoval %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmoval %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: cmoval %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movw %ax, 14(%ecx) +; X86-NEXT: movw %dx, 12(%ecx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 10(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 8(%ecx) +; X86-NEXT: movw %si, 6(%ecx) +; X86-NEXT: movw %di, 4(%ecx) +; X86-NEXT: movw %bx, 2(%ecx) +; X86-NEXT: movw %bp, (%ecx) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %r +} + +define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { +; SSE-LABEL: test_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pmaxub %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v16i8: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $40, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %bl, %al +; X86-NEXT: cmoval %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %dl, %al +; X86-NEXT: cmoval %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb 
%cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %bl, %al +; X86-NEXT: cmoval %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %dl, %al +; X86-NEXT: cmoval %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmoval %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb %cl, 15(%eax) +; X86-NEXT: movb %dl, 14(%eax) +; X86-NEXT: movb %bl, 13(%eax) +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movb %cl, 12(%eax) +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movb %cl, 11(%eax) +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movb %cl, 10(%eax) +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 9(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 7(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 6(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 5(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 4(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 3(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 2(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 1(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, (%eax) +; X86-NEXT: addl $40, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %r +} diff --git 
a/llvm/test/CodeGen/X86/umin.ll b/llvm/test/CodeGen/X86/umin.ll new file mode 100644 index 00000000000000..234c4faf6cd2b4 --- /dev/null +++ b/llvm/test/CodeGen/X86/umin.ll @@ -0,0 +1,667 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefixes=X64,SSE +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx | FileCheck %s --check-prefixes=X64,AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-linux -mattr=avx2 | FileCheck %s --check-prefixes=X64,AVX,AVX2 +; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86 + +declare i8 @llvm.umin.i8(i8, i8) +declare i16 @llvm.umin.i16(i16, i16) +declare i24 @llvm.umin.i24(i24, i24) +declare i32 @llvm.umin.i32(i32, i32) +declare i64 @llvm.umin.i64(i64, i64) +declare i128 @llvm.umin.i128(i128, i128) + +declare <1 x i32> @llvm.umin.v1i32(<1 x i32>, <1 x i32>) +declare <2 x i32> @llvm.umin.v2i32(<2 x i32>, <2 x i32>) +declare <3 x i32> @llvm.umin.v3i32(<3 x i32>, <3 x i32>) +declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>) +declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>) + +declare <8 x i16> @llvm.umin.v8i16(<8 x i16>, <8 x i16>) +declare <16 x i8> @llvm.umin.v16i8(<16 x i8>, <16 x i8>) + +define i8 @test_i8(i8 %a, i8 %b) nounwind { +; X64-LABEL: test_i8: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpb %al, %dil +; X64-NEXT: cmovbl %edi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_i8: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpb %al, %cl +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl + %r = call i8 @llvm.umin.i8(i8 %a, i8 %b) + ret i8 %r +} + +define i16 @test_i16(i16 %a, i16 %b) nounwind { +; X64-LABEL: test_i16: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpw %ax, %di +; X64-NEXT: cmovbl %edi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq +; +; X86-LABEL: test_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl + %r = call i16 @llvm.umin.i16(i16 %a, i16 %b) + ret i16 %r +} + +define i24 @test_i24(i24 %a, i24 %b) nounwind { +; X64-LABEL: test_i24: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: andl $16777215, %esi # imm = 0xFFFFFF +; X64-NEXT: andl $16777215, %eax # imm = 0xFFFFFF +; X64-NEXT: cmpl %esi, %eax +; X64-NEXT: cmovael %esi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_i24: +; X86: # %bb.0: +; X86-NEXT: movl $16777215, %eax # imm = 0xFFFFFF +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl %eax, %ecx +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovael %ecx, %eax +; X86-NEXT: retl + %r = call i24 @llvm.umin.i24(i24 %a, i24 %b) + ret i24 %r +} + +define i32 @test_i32(i32 %a, i32 %b) nounwind { +; X64-LABEL: test_i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmovbl %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: retl + %r = call i32 @llvm.umin.i32(i32 %a, i32 %b) + ret i32 %r +} + +define i64 @test_i64(i64 %a, i64 %b) nounwind { +; 
X64-LABEL: test_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: cmpq %rsi, %rdi +; X64-NEXT: cmovbq %rdi, %rax +; X64-NEXT: retq +; +; X86-LABEL: test_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: cmovbl %ecx, %edi +; X86-NEXT: cmpl %edx, %esi +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: cmovel %edi, %eax +; X86-NEXT: cmovbl %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl + %r = call i64 @llvm.umin.i64(i64 %a, i64 %b) + ret i64 %r +} + +define i128 @test_i128(i128 %a, i128 %b) nounwind { +; X64-LABEL: test_i128: +; X64: # %bb.0: +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: cmpq %rdx, %rdi +; X64-NEXT: cmovbq %rdi, %rdx +; X64-NEXT: cmpq %rcx, %rsi +; X64-NEXT: cmovbq %rdi, %rax +; X64-NEXT: cmoveq %rdx, %rax +; X64-NEXT: cmovbq %rsi, %rcx +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: retq +; +; X86-LABEL: test_i128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: cmpl %esi, %edi +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: cmovbl %eax, %ebp +; X86-NEXT: cmovel %ebx, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: cmovbl %edi, %eax +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: movl %edx, %eax +; X86-NEXT: cmovbl %edi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: xorl %edx, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: cmovel %ebp, %ecx +; X86-NEXT: cmovel (%esp), %esi # 4-byte Folded Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: cmpl %eax, %edi +; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovbl %edi, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %eax, 12(%edi) +; X86-NEXT: movl %edx, 8(%edi) +; X86-NEXT: movl %esi, 4(%edi) +; X86-NEXT: movl %ecx, (%edi) +; X86-NEXT: movl %edi, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call i128 @llvm.umin.i128(i128 %a, i128 %b) + ret i128 %r +} + +define <1 x i32> @test_v1i32(<1 x i32> %a, <1 x i32> %b) nounwind { +; X64-LABEL: test_v1i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: cmpl %esi, %edi +; X64-NEXT: cmovbl %edi, %eax +; X64-NEXT: retq +; +; X86-LABEL: test_v1i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: retl + %r = call <1 x i32> @llvm.umin.v1i32(<1 x i32> %a, <1 x 
i32> %b) + ret <1 x i32> %r +} + +define <2 x i32> @test_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { +; SSE-LABEL: test_v2i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: pxor %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v2i32: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %esi +; X86-NEXT: cmovbl %esi, %eax +; X86-NEXT: cmpl %edx, %ecx +; X86-NEXT: cmovbl %ecx, %edx +; X86-NEXT: popl %esi +; X86-NEXT: retl + %r = call <2 x i32> @llvm.umin.v2i32(<2 x i32> %a, <2 x i32> %b) + ret <2 x i32> %r +} + +define <3 x i32> @test_v3i32(<3 x i32> %a, <3 x i32> %b) nounwind { +; SSE-LABEL: test_v3i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: pxor %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v3i32: +; AVX: # %bb.0: +; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v3i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmpl %eax, %ebx +; X86-NEXT: cmovbl %ebx, %eax +; X86-NEXT: cmpl %edx, %edi +; X86-NEXT: cmovbl %edi, %edx +; X86-NEXT: cmpl %ecx, %esi +; X86-NEXT: cmovbl %esi, %ecx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl + %r = call <3 x i32> @llvm.umin.v3i32(<3 x i32> %a, <3 x i32> %b) + ret <3 x i32> %r +} + +define <4 x i32> @test_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind { +; SSE-LABEL: test_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pxor %xmm2, %xmm3 +; SSE-NEXT: pxor %xmm1, %xmm2 +; SSE-NEXT: pcmpgtd %xmm3, %xmm2 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v4i32: +; X86: # %bb.0: +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmovbl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmovbl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl $4 + %r = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %r +} + +define <8 x i32> @test_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { +; SSE-LABEL: test_v8i32: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pxor %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pxor %xmm4, %xmm6 +; SSE-NEXT: pcmpgtd %xmm5, %xmm6 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pxor %xmm4, %xmm2 +; SSE-NEXT: pxor %xmm3, %xmm4 +; SSE-NEXT: pcmpgtd %xmm2, %xmm4 +; SSE-NEXT: pand %xmm4, %xmm1 +; SSE-NEXT: pandn %xmm3, %xmm4 +; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: test_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; X86-LABEL: test_v8i32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebp, %eax +; X86-NEXT: cmovbl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ebx, %eax +; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edi, %eax +; X86-NEXT: cmovbl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: cmovbl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpl %ecx, %eax +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %eax, %edx +; X86-NEXT: cmovbl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %eax, 28(%edx) +; X86-NEXT: movl %ecx, 24(%edx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 20(%edx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, 16(%edx) +; X86-NEXT: movl %esi, 12(%edx) +; X86-NEXT: movl %edi, 8(%edx) +; X86-NEXT: movl %ebx, 4(%edx) +; X86-NEXT: movl %ebp, (%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %a, <8 x i32> %b) + ret <8 x i32> %r +} + +define <8 x i16> @test_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { 
+; SSE-LABEL: test_v8i16: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] +; SSE-NEXT: pxor %xmm2, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: pminsw %xmm1, %xmm0 +; SSE-NEXT: pxor %xmm2, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v8i16: +; AVX: # %bb.0: +; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v8i16: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bp, %ax +; X86-NEXT: cmovbl %eax, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %bx, %ax +; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %di, %ax +; X86-NEXT: cmovbl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %si, %ax +; X86-NEXT: cmovbl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %cx, %ax +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpw %dx, %ax +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: cmpw %ax, %cx +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movw %ax, 14(%ecx) +; X86-NEXT: movw %dx, 12(%ecx) +; X86-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 10(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movw %ax, 8(%ecx) +; X86-NEXT: movw %si, 6(%ecx) +; X86-NEXT: movw %di, 4(%ecx) +; X86-NEXT: movw %bx, 2(%ecx) +; X86-NEXT: movw %bp, (%ecx) +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %r +} + +define <16 x i8> @test_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { +; SSE-LABEL: test_v16i8: +; SSE: # %bb.0: +; SSE-NEXT: pminub %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: test_v16i8: +; AVX: # %bb.0: +; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq +; +; X86-LABEL: test_v16i8: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $40, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %bl, %al +; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %dl, %al +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %bl, %al +; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %dl, %al +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: cmpb %cl, %al +; X86-NEXT: cmovbl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movb %cl, 15(%eax) +; X86-NEXT: movb %dl, 14(%eax) +; X86-NEXT: movb %bl, 13(%eax) +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: movb %cl, 12(%eax) +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movb %cl, 11(%eax) +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: movb %cl, 10(%eax) +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 9(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 8(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 7(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 6(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 5(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 4(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 3(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 2(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, 1(%eax) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movb %cl, (%eax) +; X86-NEXT: 
addl $40, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 + %r = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %r +} diff --git a/llvm/test/Transforms/SCCP/conditions-ranges.ll b/llvm/test/Transforms/SCCP/conditions-ranges.ll index 612a38f008fc55..dada59099d81be 100644 --- a/llvm/test/Transforms/SCCP/conditions-ranges.ll +++ b/llvm/test/Transforms/SCCP/conditions-ranges.ll @@ -231,12 +231,12 @@ define void @f7_nested_conds(i32* %a, i32 %b) { ; CHECK-NEXT: [[C_1:%.*]] = icmp ne i32 [[A_V]], 0 ; CHECK-NEXT: br i1 [[C_1]], label [[TRUE:%.*]], label [[FALSE:%.*]] ; CHECK: false: -; CHECK-NEXT: br i1 true, label [[TRUE_2:%.*]], label [[TRUE]] +; CHECK-NEXT: br label [[TRUE_2:%.*]] ; CHECK: true.2: ; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: ret void ; CHECK: true: -; CHECK-NEXT: store i32 [[B:%.*]], i32* [[A]] +; CHECK-NEXT: store i32 [[B:%.*]], i32* [[A]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SCCP/predicateinfo-cond.ll b/llvm/test/Transforms/SCCP/predicateinfo-cond.ll index 8ed96ec9301f5c..1443cc72c2ef8d 100644 --- a/llvm/test/Transforms/SCCP/predicateinfo-cond.ll +++ b/llvm/test/Transforms/SCCP/predicateinfo-cond.ll @@ -105,7 +105,7 @@ define void @pr46814(i32 %a) { ; CHECK-NEXT: [[C3:%.*]] = and i1 [[C1]], [[C2]] ; CHECK-NEXT: br i1 [[C3]], label [[IF_1:%.*]], label [[EXIT:%.*]] ; CHECK: if.1: -; CHECK-NEXT: br i1 true, label [[IF_2:%.*]], label [[EXIT]] +; CHECK-NEXT: br label [[IF_2:%.*]] ; CHECK: if.2: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[EXIT]] ; CHECK: exit: diff --git a/llvm/test/Transforms/SCCP/resolvedundefsin-tracked-fn.ll b/llvm/test/Transforms/SCCP/resolvedundefsin-tracked-fn.ll index e1c7b3d5662d0b..9e9d1256c4cc01 100644 --- a/llvm/test/Transforms/SCCP/resolvedundefsin-tracked-fn.ll +++ b/llvm/test/Transforms/SCCP/resolvedundefsin-tracked-fn.ll @@ -136,13 +136,12 @@ define internal i1 @test2_g(%t1* %h, i32 %i) { ; CHECK-LABEL: define {{[^@]+}}@test2_g ; CHECK-SAME: (%t1* [[H:%.*]], i32 [[I:%.*]]) ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 true, label [[LAND_RHS:%.*]], label [[LAND_END:%.*]] +; CHECK-NEXT: br label [[LAND_RHS:%.*]] ; CHECK: land.rhs: ; CHECK-NEXT: [[CALL:%.*]] = call i32 (...) 
@test2_j() ; CHECK-NEXT: [[TOBOOL1:%.*]] = icmp ne i32 [[CALL]], 0 -; CHECK-NEXT: br label [[LAND_END]] +; CHECK-NEXT: br label [[LAND_END:%.*]] ; CHECK: land.end: -; CHECK-NEXT: [[TMP0:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[TOBOOL1]], [[LAND_RHS]] ] ; CHECK-NEXT: ret i1 undef ; entry: @@ -196,10 +195,9 @@ define internal i32 @test3_k(i8 %h, i32 %i) { ; CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[CONV]] to %t1* ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[PHI:%.*]] = phi i1 [ undef, [[ENTRY:%.*]] ], [ false, [[LOOP]] ] ; CHECK-NEXT: [[CALL:%.*]] = call i1 @test3_g(%t1* [[TMP1]], i32 0) ; CHECK-NEXT: call void @use.1(i1 false) -; CHECK-NEXT: br i1 false, label [[LOOP]], label [[EXIT:%.*]] +; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: ; CHECK-NEXT: ret i32 undef ; diff --git a/llvm/test/Transforms/SCCP/switch-constantfold-crash.ll b/llvm/test/Transforms/SCCP/switch-constantfold-crash.ll index 7596a56b81229c..17b37f000407cb 100644 --- a/llvm/test/Transforms/SCCP/switch-constantfold-crash.ll +++ b/llvm/test/Transforms/SCCP/switch-constantfold-crash.ll @@ -5,11 +5,11 @@ define void @barney() { ; CHECK-LABEL: @barney( ; CHECK-NEXT: bb: -; CHECK-NEXT: br label %bb9 +; CHECK-NEXT: br label [[BB9:%.*]] ; CHECK: bb6: ; CHECK-NEXT: unreachable ; CHECK: bb9: -; CHECK-NEXT: unreachable +; CHECK-NEXT: br label [[BB6:%.*]] ; bb: br label %bb9 @@ -29,9 +29,9 @@ bb9: ; preds = %bb define void @blam() { ; CHECK-LABEL: @blam( ; CHECK-NEXT: bb: -; CHECK-NEXT: br label %bb16 +; CHECK-NEXT: br label [[BB16:%.*]] ; CHECK: bb16: -; CHECK-NEXT: br label %bb38 +; CHECK-NEXT: br label [[BB38:%.*]] ; CHECK: bb38: ; CHECK-NEXT: unreachable ; @@ -62,9 +62,9 @@ bb38: ; preds = %bb16 define void @hoge() { ; CHECK-LABEL: @hoge( ; CHECK-NEXT: bb: -; CHECK-NEXT: br label %bb2 +; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: unreachable +; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb3: ; CHECK-NEXT: unreachable ; diff --git a/llvm/test/Transforms/SCCP/switch.ll b/llvm/test/Transforms/SCCP/switch.ll index d895c5629cd41f..3587587bcb91d7 100644 --- a/llvm/test/Transforms/SCCP/switch.ll +++ b/llvm/test/Transforms/SCCP/switch.ll @@ -23,15 +23,11 @@ define i32 @test_duplicate_successors_phi(i1 %c, i32 %x) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C:%.*]], label [[SWITCH:%.*]], label [[END:%.*]] ; CHECK: switch: -; CHECK-NEXT: switch i32 -1, label [[SWITCH_DEFAULT:%.*]] [ -; CHECK-NEXT: i32 0, label [[END]] -; CHECK-NEXT: i32 1, label [[END]] -; CHECK-NEXT: ] +; CHECK-NEXT: br label [[SWITCH_DEFAULT:%.*]] ; CHECK: switch.default: ; CHECK-NEXT: ret i32 -1 ; CHECK: end: -; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[X:%.*]], [[ENTRY:%.*]] ], [ 1, [[SWITCH]] ], [ 1, [[SWITCH]] ] -; CHECK-NEXT: ret i32 [[PHI]] +; CHECK-NEXT: ret i32 [[X:%.*]] ; entry: br i1 %c, label %switch, label %end @@ -77,6 +73,53 @@ end: ret i32 %phi } +define i32 @test_duplicate_successors_phi_3(i1 %c1, i32 %x) { +; CHECK-LABEL: @test_duplicate_successors_phi_3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C1:%.*]], label [[SWITCH:%.*]], label [[SWITCH_1:%.*]] +; CHECK: switch: +; CHECK-NEXT: [[C2:%.*]] = icmp ult i32 [[X:%.*]], 3 +; CHECK-NEXT: call void @llvm.assume(i1 [[C2]]) +; CHECK-NEXT: switch i32 [[X]], label [[SWITCH_DEFAULT:%.*]] [ +; CHECK-NEXT: i32 0, label [[SWITCH_DEFAULT]] +; CHECK-NEXT: i32 1, label [[SWITCH_0:%.*]] +; CHECK-NEXT: i32 2, label [[SWITCH_0]] +; CHECK-NEXT: i32 3, label [[SWITCH_1]] +; CHECK-NEXT: i32 4, label [[SWITCH_1]] +; CHECK-NEXT: ] +; CHECK: switch.default: +; CHECK-NEXT: ret i32 
-1 +; CHECK: switch.0: +; CHECK-NEXT: ret i32 0 +; CHECK: switch.1: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[X]], [[ENTRY:%.*]] ], [ 0, [[SWITCH]] ], [ 0, [[SWITCH]] ] +; CHECK-NEXT: ret i32 [[PHI]] +; +entry: + br i1 %c1, label %switch, label %switch.1 + +switch: + %c2 = icmp ult i32 %x, 3 + call void @llvm.assume(i1 %c2) + switch i32 %x, label %switch.default [ + i32 0, label %switch.default + i32 1, label %switch.0 + i32 2, label %switch.0 + i32 3, label %switch.1 + i32 4, label %switch.1 + ] + +switch.default: + ret i32 -1 + +switch.0: + ret i32 0 + +switch.1: + %phi = phi i32 [ %x, %entry ], [ 0, %switch ], [ 0, %switch ] + ret i32 %phi +} + define i32 @test_local_range(i32 %x) { ; CHECK-LABEL: @test_local_range( ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[X:%.*]], 3 diff --git a/llvm/test/Transforms/SCCP/widening.ll b/llvm/test/Transforms/SCCP/widening.ll index 2703bdb27dff4f..23a88c35a93eac 100644 --- a/llvm/test/Transforms/SCCP/widening.ll +++ b/llvm/test/Transforms/SCCP/widening.ll @@ -216,11 +216,11 @@ define void @rotated_loop_2(i32 %x) { ; IPSCCP: bb3: ; IPSCCP-NEXT: br label [[EXIT]] ; IPSCCP: exit: -; IPSCCP-NEXT: [[P:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ 3, [[BB1]] ], [ 2, [[BB2]] ], [ 5, [[BB3]] ], [ [[A:%.*]], [[EXIT]] ] -; IPSCCP-NEXT: [[A]] = add i32 [[P]], 1 +; IPSCCP-NEXT: [[P:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ 3, [[BB1]] ], [ 2, [[BB2]] ], [ 5, [[BB3]] ] +; IPSCCP-NEXT: [[A:%.*]] = add i32 [[P]], 1 ; IPSCCP-NEXT: call void @use(i1 true) ; IPSCCP-NEXT: call void @use(i1 false) -; IPSCCP-NEXT: br i1 false, label [[EXIT]], label [[EXIT_1:%.*]] +; IPSCCP-NEXT: br label [[EXIT_1:%.*]] ; IPSCCP: exit.1: ; IPSCCP-NEXT: ret void ; @@ -451,10 +451,10 @@ define void @foo(i64* %arg) { ; SCCP-NEXT: [[TMP7:%.*]] = sub i64 3, [[TMP6]] ; SCCP-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 1 ; SCCP-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 -; SCCP-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +; SCCP-NEXT: [[TMP0:%.*]] = zext i32 [[TMP9]] to i64 ; SCCP-NEXT: br label [[BB11:%.*]] ; SCCP: bb11: -; SCCP-NEXT: [[TMP12:%.*]] = phi i64 [ [[TMP10]], [[BB4]] ], [ [[TMP17:%.*]], [[BB18:%.*]] ] +; SCCP-NEXT: [[TMP12:%.*]] = phi i64 [ [[TMP0]], [[BB4]] ], [ [[TMP17:%.*]], [[BB18:%.*]] ] ; SCCP-NEXT: br label [[BB13:%.*]] ; SCCP: bb13: ; SCCP-NEXT: [[C_1:%.*]] = icmp eq i64 [[TMP12]], 6 @@ -489,10 +489,10 @@ define void @foo(i64* %arg) { ; IPSCCP-NEXT: [[TMP7:%.*]] = sub i64 3, [[TMP6]] ; IPSCCP-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 1 ; IPSCCP-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 -; IPSCCP-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64 +; IPSCCP-NEXT: [[TMP0:%.*]] = zext i32 [[TMP9]] to i64 ; IPSCCP-NEXT: br label [[BB11:%.*]] ; IPSCCP: bb11: -; IPSCCP-NEXT: [[TMP12:%.*]] = phi i64 [ [[TMP10]], [[BB4]] ], [ [[TMP17:%.*]], [[BB18:%.*]] ] +; IPSCCP-NEXT: [[TMP12:%.*]] = phi i64 [ [[TMP0]], [[BB4]] ], [ [[TMP17:%.*]], [[BB18:%.*]] ] ; IPSCCP-NEXT: br label [[BB13:%.*]] ; IPSCCP: bb13: ; IPSCCP-NEXT: [[C_1:%.*]] = icmp eq i64 [[TMP12]], 6 diff --git a/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp b/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp index 399c6dd4f7de01..7b2a985bba6616 100644 --- a/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp +++ b/llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp @@ -1,4 +1,4 @@ -//===- FunctionPropertiesAnalysisTest.cpp - function properties unit tests-===// +//===- FunctionPropertiesAnalysisTest.cpp - Function Properties Unit Tests-===// // // Part of the LLVM Project, under the Apache License v2.0 with 
LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -8,6 +8,7 @@ #include "llvm/Analysis/FunctionPropertiesAnalysis.h" #include "llvm/AsmParser/Parser.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" @@ -15,42 +16,49 @@ #include "gtest/gtest.h" using namespace llvm; +namespace { -static std::unique_ptr parseIR(LLVMContext &C, const char *IR) { - SMDiagnostic Err; - std::unique_ptr Mod = parseAssemblyString(IR, Err, C); - if (!Mod) - Err.print("MLAnalysisTests", errs()); - return Mod; -} +class FunctionPropertiesAnalysisTest : public testing::Test { +protected: + std::unique_ptr DT; + std::unique_ptr LI; + + FunctionPropertiesInfo buildFPI(Function &F) { + DT.reset(new DominatorTree(F)); + LI.reset(new LoopInfo(*DT)); + return FunctionPropertiesInfo::getFunctionPropertiesInfo(F, *LI); + } -TEST(FunctionPropertiesTest, BasicTest) { + std::unique_ptr makeLLVMModule(LLVMContext &C, const char *IR) { + SMDiagnostic Err; + std::unique_ptr Mod = parseAssemblyString(IR, Err, C); + if (!Mod) + Err.print("MLAnalysisTests", errs()); + return Mod; + } +}; + +TEST_F(FunctionPropertiesAnalysisTest, BasicTest) { LLVMContext C; - std::unique_ptr M = parseIR(C, - R"IR( + std::unique_ptr M = makeLLVMModule(C, + R"IR( target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-linux-gnu" - declare i32 @f1(i32) declare i32 @f2(i32) - define i32 @branches(i32) { %cond = icmp slt i32 %0, 3 br i1 %cond, label %then, label %else - then: %ret.1 = call i32 @f1(i32 %0) br label %last.block - else: %ret.2 = call i32 @f2(i32 %0) br label %last.block - last.block: %ret = phi i32 [%ret.1, %then], [%ret.2, %else] ret i32 %ret } - define internal i32 @top() { %1 = call i32 @branches(i32 2) %2 = call i32 @f1(i32 %1) @@ -58,20 +66,28 @@ define internal i32 @top() { } )IR"); - FunctionAnalysisManager FAM; - FunctionPropertiesAnalysis FA; - - auto BranchesFeatures = FA.run(*M->getFunction("branches"), FAM); + Function *BranchesFunction = M->getFunction("branches"); + FunctionPropertiesInfo BranchesFeatures = buildFPI(*BranchesFunction); EXPECT_EQ(BranchesFeatures.BasicBlockCount, 4); EXPECT_EQ(BranchesFeatures.BlocksReachedFromConditionalInstruction, 2); - EXPECT_EQ(BranchesFeatures.DirectCallsToDefinedFunctions, 0); // 2 Users: top is one. The other is added because @branches is not internal, // so it may have external callers. 
 EXPECT_EQ(BranchesFeatures.Uses, 2);
+  EXPECT_EQ(BranchesFeatures.DirectCallsToDefinedFunctions, 0);
+  EXPECT_EQ(BranchesFeatures.LoadInstCount, 0);
+  EXPECT_EQ(BranchesFeatures.StoreInstCount, 0);
+  EXPECT_EQ(BranchesFeatures.MaxLoopDepth, 0);
+  EXPECT_EQ(BranchesFeatures.TopLevelLoopCount, 0);
 
-  auto TopFeatures = FA.run(*M->getFunction("top"), FAM);
+  Function *TopFunction = M->getFunction("top");
+  FunctionPropertiesInfo TopFeatures = buildFPI(*TopFunction);
   EXPECT_EQ(TopFeatures.BasicBlockCount, 1);
   EXPECT_EQ(TopFeatures.BlocksReachedFromConditionalInstruction, 0);
-  EXPECT_EQ(TopFeatures.DirectCallsToDefinedFunctions, 1);
   EXPECT_EQ(TopFeatures.Uses, 0);
+  EXPECT_EQ(TopFeatures.DirectCallsToDefinedFunctions, 1);
+  EXPECT_EQ(TopFeatures.LoadInstCount, 0);
+  EXPECT_EQ(TopFeatures.StoreInstCount, 0);
+  EXPECT_EQ(TopFeatures.MaxLoopDepth, 0);
+  EXPECT_EQ(TopFeatures.TopLevelLoopCount, 0);
 }
 
+} // end anonymous namespace
\ No newline at end of file
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
index 6ffc181fed67a1..2c327276610aef 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -223,10 +223,10 @@ static bool isTiled(AffineMap map, ArrayRef tileSizes) {
 }
 
 static SmallVector makeTiledViews(OpBuilder &b, Location loc,
-                                  LinalgOp linalgOp,
+                                  LinalgOp linalgOp, AffineMap map,
                                   ArrayRef ivs,
                                   ArrayRef tileSizes,
-                                  ArrayRef viewSizes) {
+                                  ArrayRef allViewSizes) {
   assert(linalgOp.hasBufferSemantics() &&
          "expected linalg op with buffer semantics");
   assert(ivs.size() == static_cast(llvm::count_if(
@@ -236,6 +236,7 @@ static SmallVector makeTiledViews(OpBuilder &b, Location loc,
 
   using namespace edsc::op;
 
+  auto viewSizes = applyMapToValues(b, loc, map, allViewSizes);
   // Construct (potentially temporary) mins and maxes on which to apply maps
   // that define tile subviews.
   SmallVector lbs, subViewSizes;
@@ -356,7 +357,7 @@ Optional static tileLinalgOpImpl(
     return llvm::None;
 
   // 2. Build the tiled loop ranges.
-  auto viewSizes = getViewSizes(b, op);
+  auto allViewSizes = getViewSizes(b, op);
   // The flattened loopToOperandRangesMaps is expected to be an invertible
   // permutation map (asserted in the inverse calculation).
auto mapsRange = op.indexing_maps().getAsRange(); @@ -369,7 +370,7 @@ Optional static tileLinalgOpImpl( SmallVector loopRanges; LoopIndexToRangeIndexMap loopIndexToRangeIndex; std::tie(loopRanges, loopIndexToRangeIndex) = makeTiledLoopRanges( - b, scope.getLocation(), viewSizesToLoopsMap, viewSizes, tileSizes); + b, scope.getLocation(), viewSizesToLoopsMap, allViewSizes, tileSizes); if (!options.interchangeVector.empty()) applyPermutationToVector(loopRanges, options.interchangeVector); @@ -395,7 +396,8 @@ Optional static tileLinalgOpImpl( if (!options.interchangeVector.empty()) ivValues = applyMapToValues(b, loc, invPermutationMap, ivValues); - auto views = makeTiledViews(b, loc, op, ivValues, tileSizes, viewSizes); + auto views = makeTiledViews(b, loc, op, viewSizesToLoopsMap, ivValues, + tileSizes, allViewSizes); auto operands = getAssumedNonViewOperands(op); views.append(operands.begin(), operands.end()); res = op.clone(b, loc, views); diff --git a/mlir/test/Dialect/Linalg/tile_conv.mlir b/mlir/test/Dialect/Linalg/tile_conv.mlir index 1bbb8b60382ef9..a08a2f1e585c64 100644 --- a/mlir/test/Dialect/Linalg/tile_conv.mlir +++ b/mlir/test/Dialect/Linalg/tile_conv.mlir @@ -9,36 +9,38 @@ func @conv(%arg0: memref, %arg1: linalg.conv(%arg0, %arg1, %arg2) {dilations = [10, 20], strides = [30, 40]} : memref, memref, memref return } -// TILE-23004-LABEL: func @conv( -// TILE-23004: %{{.*}}: memref, %{{.*}}: memref, %{{.*}}: memref) { -// TILE-23004-DAG: %[[C0:.*]] = constant 0 : index -// TILE-23004-DAG: %[[C2:.*]] = constant 2 : index -// TILE-23004-DAG: %[[C3:.*]] = constant 3 : index -// TILE-23004-DAG: %[[C4:.*]] = constant 4 : index -// TILE-23004: %[[Q:.*]] = dim %{{.*}}, %c2 : memref -// TILE-23004: %[[B:.*]] = dim %{{.*}}, %c0 : memref -// TILE-23004: %[[PaddedInput0:.*]] = dim %{{.*}}, %c1 : memref -// TILE-23004: %[[X0:.*]] = dim %{{.*}}, %c1 : memref +// TILE-23004: func @conv( +// TILE-23004-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref +// TILE-23004-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref +// TILE-23004-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref) +// TILE-23004-DAG: %[[C0:.*]] = constant 0 : index +// TILE-23004-DAG: %[[C2:.*]] = constant 2 : index +// TILE-23004-DAG: %[[C3:.*]] = constant 3 : index +// TILE-23004-DAG: %[[C4:.*]] = constant 4 : index +// TILE-23004: %[[Z0:.*]] = dim %[[ARG0]], %c0 : memref +// TILE-23004: %[[Q:.*]] = dim %[[ARG0]], %c2 : memref +// TILE-23004: %[[B:.*]] = dim %[[ARG1]], %c0 : memref +// TILE-23004: %[[X0:.*]] = dim %[[ARG2]], %c1 : memref // TILE-23004: scf.for %[[ivI:.*]] = %{{.*}} to %[[B]] step %{{.*}} { // TILE-23004: scf.for %[[ivJ:.*]] = %{{.*}} to %[[X0]] step %{{.*}} { // TILE-23004: scf.for %[[ivK:.*]] = %{{.*}} to %[[Q]] step %{{.*}} { -// TILE-23004: %[[Z0:.*]] = dim %{{.*}}, %c0 : memref -// TILE-23004: %[[Z1:.*]] = dim %{{.*}}, %c1 : memref -// TILE-23004: %[[Z2:.*]] = dim %{{.*}}, %c2 : memref +// TILE-23004: %[[Z0_1:.*]] = dim %[[ARG0]], %c0 : memref +// TILE-23004: %[[Z1:.*]] = dim %[[ARG0]], %c1 : memref +// TILE-23004: %[[Z2:.*]] = dim %[[ARG0]], %c2 : memref // TILE-23004: %[[szK:.*]] = affine.min #[[$bound_map_4]](%[[ivK]])[%[[Z2]]] -// TILE-23004: %[[K:.*]] = dim %{{.*}}, %c3 : memref -// TILE-23004: %[[FilterView:.*]] = subview %{{.*}}[0, 0, %[[ivK]], 0] [%[[Z0]], %[[Z1]], %[[szK]], %[[K]]] [1, 1, 1, 1] : memref to memref +// TILE-23004: %[[K:.*]] = dim %[[ARG0]], %c3 : memref +// TILE-23004: %[[FilterView:.*]] = subview %{{.*}}[0, 0, %[[ivK]], 0] [%[[Z0_1]], %[[Z1]], %[[szK]], %[[K]]] [1, 1, 1, 1] : memref to memref // // TILE-23004: %[[J1:.*]] 
= affine.apply #[[$D0x30pS0x10]](%[[ivJ]]) -// TILE-23004: %[[PaddedInput0b:.*]] = dim %{{.*}}, %c1 : memref -// TILE-23004: %[[I1pStep:.*]] = affine.min #[[$S0x10p90D0x30pS1]](%[[ivJ]])[%[[PaddedInput0]], %[[PaddedInput0b]]] -// TILE-23004: %[[SZ2:.*]] = dim %{{.*}}, %c2 : memref -// TILE-23004: %[[dim3:.*]] = dim %{{.*}}, %c3 +// TILE-23004: %[[PaddedInput0b:.*]] = dim %[[ARG1]], %c1 : memref +// TILE-23004: %[[I1pStep:.*]] = affine.min #[[$S0x10p90D0x30pS1]](%[[ivJ]])[%[[Z0]], %[[PaddedInput0b]]] +// TILE-23004: %[[SZ2:.*]] = dim %[[ARG1]], %c2 : memref +// TILE-23004: %[[dim3:.*]] = dim %[[ARG1]], %c3 // TILE-23004: %[[sz3:.*]] = affine.min #[[$bound_map_4]](%[[ivK]])[%[[dim3]]] // TILE-23004: %[[InputView:.*]] = subview %{{.*}}[%[[ivI]], %[[J1]], 0, %[[ivK]]] [%{{.*}}, %{{.*}}, %[[SZ2]], %[[sz3]]] [1, 1, 1, 1] : memref to memref // -// TILE-23004: %[[X0:.*]] = dim %{{.*}}, %c2 : memref -// TILE-23004: %[[X1:.*]] = dim %{{.*}}, %c3 : memref +// TILE-23004: %[[X0:.*]] = dim %[[ARG2]], %c2 : memref +// TILE-23004: %[[X1:.*]] = dim %[[ARG2]], %c3 : memref // TILE-23004: %[[OutputView:.*]] = subview %{{.*}}[%[[ivI]], %[[ivJ]], 0, 0] [%{{.*}}, %{{.*}}, %[[X0]], %[[X1]]] [1, 1, 1, 1] : memref to memref // // TILE-23004: linalg.conv(%[[FilterView]], %[[InputView]], %[[OutputView]]) {dilations = [10, 20], strides = [30, 40]} : memref, memref, memref diff --git a/mlir/test/Dialect/Linalg/tile_simple_conv.mlir b/mlir/test/Dialect/Linalg/tile_simple_conv.mlir new file mode 100644 index 00000000000000..f854f7570fef3f --- /dev/null +++ b/mlir/test/Dialect/Linalg/tile_simple_conv.mlir @@ -0,0 +1,49 @@ +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,3,4" | FileCheck %s + +// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0)[s0] -> (2, -d0 + s0)> +// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0)[s0, s1] -> (s0 + 3, -d0 + s1)> +// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0)[s0, s1] -> (s0 + 4, -d0 + s1)> +// CHECK-DAG: #[[MAP4:.*]] = affine_map<(d0)[s0] -> (3, -d0 + s0)> +// CHECK-DAG: #[[MAP5:.*]] = affine_map<(d0)[s0] -> (4, -d0 + s0)> + +func @conv(%arg0 : memref, %arg1 : memref, %arg2 : memref) { + linalg.conv(%arg0, %arg1, %arg2) : memref, memref, memref + return +} + +// CHECK: func @conv +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]: memref +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]*]]: memref +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]*]]: memref +// CHECK-DAG: %[[C0:.*]] = constant 0 : index +// CHECK-DAG: %[[C1:.*]] = constant 1 : index +// CHECK-DAG: %[[C2:.*]] = constant 2 : index +// CHECK-DAG: %[[C3:.*]] = constant 3 : index +// CHECK-DAG: %[[C4:.*]] = constant 4 : index +// CHECK: %[[T0:.*]] = dim %[[ARG0]], %[[C0]] +// CHECK: %[[T1:.*]] = dim %[[ARG0]], %[[C1]] +// CHECK: %[[T2:.*]] = dim %[[ARG1]], %[[C0]] +// CHECK: %[[T3:.*]] = dim %[[ARG2]], %[[C1]] +// CHECK: %[[T4:.*]] = dim %[[ARG2]], %[[C2]] +// CHECK: scf.for %[[ARG3:.*]] = %[[C0]] to %[[T2]] step %[[C2]] +// CHECK: scf.for %[[ARG4:.*]] = %[[C0]] to %[[T3]] step %[[C3]] +// CHECK: scf.for %[[ARG5:.*]] = %[[C0]] to %[[T4]] step %[[C4]] +// CHECK: %[[T5:.*]] = dim %[[ARG1]], %[[C0]] +// CHECK: %[[T6:.*]] = affine.min #[[MAP0]](%[[ARG3]])[%[[T5]]] +// CHECK: %[[T7:.*]] = dim %[[ARG1]], %[[C1]] +// CHECK: %[[T8:.*]] = affine.min #[[MAP1]](%[[ARG4]])[%[[T0]], %[[T7]]] +// CHECK: %[[T9:.*]] = dim %[[ARG1]], %[[C2]] +// CHECK: %[[T10:.*]] = affine.min #[[MAP2]](%[[ARG5]])[%[[T1]], %[[T9]]] +// CHECK: %[[T11:.*]] = dim %[[ARG1]], %[[C3]] +// CHECK: %[[SV1:.*]] = subview %[[ARG1]][%[[ARG3]], %[[ARG4]], %[[ARG5]], 0] +// CHECK-SAME: [%[[T6]], %[[T8]], %[[T10]], 
%[[T11]]] +// CHECK: %[[T13:.*]] = dim %[[ARG2]], %[[C0]] +// CHECK: %[[T14:.*]] = affine.min #[[MAP0]](%[[ARG3]])[%[[T13]]] +// CHECK: %[[T15:.*]] = dim %[[ARG2]], %[[C1]] +// CHECK: %[[T16:.*]] = affine.min #[[MAP4]](%[[ARG4]])[%[[T15]]] +// CHECK: %[[T17:.*]] = dim %[[ARG2]], %[[C2]] +// CHECK: %[[T18:.*]] = affine.min #[[MAP5]](%[[ARG5]])[%[[T17]]] +// CHECK: %[[T19:.*]] = dim %[[ARG2]], %[[C3]] +// CHECK: %[[SV2:.*]] = subview %[[ARG2]][%[[ARG3]], %[[ARG4]], %[[ARG5]], 0] +// CHECK-SAME: [%[[T14]], %[[T16]], %[[T18]], %[[T19]]] +// CHECK: linalg.conv(%[[ARG0]], %[[SV1]], %[[SV2]]) \ No newline at end of file
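For reference only, not part of the patch: the FunctionPropertiesAnalysisTest changes above add loop-aware fields (MaxLoopDepth, TopLevelLoopCount) and a buildFPI() helper, but both functions in the test IR are loop-free, so those fields are only ever checked against 0. A minimal sketch of a companion TEST_F in the same fixture, exercising the helper on a function that does contain a loop, might look as follows; the test name, function name, and IR body are invented for illustration and assume the includes and fixture defined in that test file.

// Hypothetical companion test (illustrative sketch, not from the patch):
// runs buildFPI() on a function with one loop so the loop-related
// properties take non-zero values.
TEST_F(FunctionPropertiesAnalysisTest, SingleLoopSketch) {
  LLVMContext C;
  std::unique_ptr<Module> M = makeLLVMModule(C,
                                             R"IR(
define i64 @sum(i64 %n) {
entry:
  br label %loop
loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %acc = phi i64 [ 0, %entry ], [ %acc.next, %loop ]
  %acc.next = add i64 %acc, %i
  %i.next = add i64 %i, 1
  %cond = icmp ult i64 %i.next, %n
  br i1 %cond, label %loop, label %exit
exit:
  ret i64 %acc.next
}
)IR");

  Function *SumFunction = M->getFunction("sum");
  FunctionPropertiesInfo SumFeatures = buildFPI(*SumFunction);
  // entry, loop and exit blocks; the single self-loop gives depth 1 and one
  // top-level loop; the function performs no memory accesses.
  EXPECT_EQ(SumFeatures.BasicBlockCount, 3);
  EXPECT_EQ(SumFeatures.MaxLoopDepth, 1);
  EXPECT_EQ(SumFeatures.TopLevelLoopCount, 1);
  EXPECT_EQ(SumFeatures.LoadInstCount, 0);
  EXPECT_EQ(SumFeatures.StoreInstCount, 0);
}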