From 4f4e68799fd55c7023e685161de6f6bb1ada16d5 Mon Sep 17 00:00:00 2001 From: Michael Liao Date: Tue, 24 Mar 2020 16:58:15 -0400 Subject: [PATCH 01/33] [test][clang][driver] Add required features. - to avoid false alarms on builds without that features. --- clang/test/Driver/save-temps.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/test/Driver/save-temps.c b/clang/test/Driver/save-temps.c index b0cfa4fd814a88..a26ba9f4ec0d36 100644 --- a/clang/test/Driver/save-temps.c +++ b/clang/test/Driver/save-temps.c @@ -1,3 +1,6 @@ +// REQUIRES: x86-registered-target +// REQUIRES: arm-registered-target + // RUN: %clang -target x86_64-apple-darwin -save-temps -arch x86_64 %s -### 2>&1 \ // RUN: | FileCheck %s // CHECK: "-o" "save-temps.i" From 3f1defa6e2dfdea2a5980083de2b04068f640023 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Fri, 20 Mar 2020 14:18:04 -0700 Subject: [PATCH 02/33] [clang codegen] Clean up handling of vectors with trivial-auto-var-init. The code was pretending to be doing something useful with vectors, but really it was doing nothing: the element type of a vector is always a scalar type, so constWithPadding would always just return the input constant. Split off from D75661 so it can be reviewed separately. While I'm here, also add testcase to show missing vector handling. Differential Revision: https://reviews.llvm.org/D76528 --- clang/lib/CodeGen/CGDecl.cpp | 17 +++++++---------- clang/test/CodeGenCXX/auto-var-init.cpp | 19 +++++++++++++++++++ 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index c5de1e077b0cdb..a6417f05a7a63b 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -1050,13 +1050,13 @@ static llvm::Constant *constWithPadding(CodeGenModule &CGM, IsPattern isPattern, llvm::Type *OrigTy = constant->getType(); if (const auto STy = dyn_cast(OrigTy)) return constStructWithPadding(CGM, isPattern, STy, constant); - if (auto *STy = dyn_cast(OrigTy)) { + if (auto *ArrayTy = dyn_cast(OrigTy)) { llvm::SmallVector Values; - unsigned Size = STy->getNumElements(); + uint64_t Size = ArrayTy->getNumElements(); if (!Size) return constant; - llvm::Type *ElemTy = STy->getElementType(); - bool ZeroInitializer = constant->isZeroValue(); + llvm::Type *ElemTy = ArrayTy->getElementType(); + bool ZeroInitializer = constant->isNullValue(); llvm::Constant *OpValue, *PaddedOp; if (ZeroInitializer) { OpValue = llvm::Constant::getNullValue(ElemTy); @@ -1072,13 +1072,10 @@ static llvm::Constant *constWithPadding(CodeGenModule &CGM, IsPattern isPattern, auto *NewElemTy = Values[0]->getType(); if (NewElemTy == ElemTy) return constant; - if (OrigTy->isArrayTy()) { - auto *ArrayTy = llvm::ArrayType::get(NewElemTy, Size); - return llvm::ConstantArray::get(ArrayTy, Values); - } else { - return llvm::ConstantVector::get(Values); - } + auto *NewArrayTy = llvm::ArrayType::get(NewElemTy, Size); + return llvm::ConstantArray::get(NewArrayTy, Values); } + // FIXME: Do we need to handle tail padding in vectors? return constant; } diff --git a/clang/test/CodeGenCXX/auto-var-init.cpp b/clang/test/CodeGenCXX/auto-var-init.cpp index 9cd71bdfd1a7dc..9399ec4eca089a 100644 --- a/clang/test/CodeGenCXX/auto-var-init.cpp +++ b/clang/test/CodeGenCXX/auto-var-init.cpp @@ -1610,5 +1610,24 @@ TEST_CUSTOM(doublevec32, double __attribute__((vector_size(32))), { 3.141592653 // CHECK-NEXT: store <4 x double> , <4 x double>* %custom, align [[ALIGN]] // CHECK-NEXT: call void @{{.*}}used{{.*}}%custom) +// TODO: This vector has tail padding +TEST_UNINIT(doublevec24, double __attribute__((vector_size(24)))); +// CHECK-LABEL: @test_doublevec24_uninit() +// CHECK: %uninit = alloca <3 x double>, align +// CHECK-NEXT: call void @{{.*}}used{{.*}}%uninit) +// PATTERN-LABEL: @test_doublevec24_uninit() +// PATTERN: store <3 x double> , <3 x double>* %uninit, align 32 +// ZERO-LABEL: @test_doublevec24_uninit() +// ZERO: store <3 x double> zeroinitializer, <3 x double>* %uninit, align 32 + +// TODO: This vector has tail padding +TEST_UNINIT(longdoublevec32, long double __attribute__((vector_size(sizeof(long double)*2)))); +// CHECK-LABEL: @test_longdoublevec32_uninit() +// CHECK: %uninit = alloca <2 x x86_fp80>, align +// CHECK-NEXT: call void @{{.*}}used{{.*}}%uninit) +// PATTERN-LABEL: @test_longdoublevec32_uninit() +// PATTERN: store <2 x x86_fp80> , <2 x x86_fp80>* %uninit, align 32 +// ZERO-LABEL: @test_longdoublevec32_uninit() +// ZERO: store <2 x x86_fp80> zeroinitializer, <2 x x86_fp80>* %uninit, align 32 } // extern "C" From a4e8d89704d2584d5c56cb27745beab25c7b9b36 Mon Sep 17 00:00:00 2001 From: Mitch Phillips <31459023+hctim@users.noreply.github.com> Date: Tue, 24 Mar 2020 14:37:07 -0700 Subject: [PATCH 03/33] [GWP-ASan] Only pack frames that are stored. Summary: Backtrace() returns the number of frames that are *available*, rather than the number of frames stored. When we pack, we supply the number of frames we have stored. The number of available frames can exceed the number of stored frames, leading to stack OOB read. Fix up this problem. Reviewers: eugenis Reviewed By: eugenis Subscribers: #sanitizers, morehouse, cferris, pcc Tags: #sanitizers Differential Revision: https://reviews.llvm.org/D76722 --- compiler-rt/lib/gwp_asan/common.cpp | 5 +++ compiler-rt/lib/gwp_asan/tests/backtrace.cpp | 37 ++++++++++++++++---- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/compiler-rt/lib/gwp_asan/common.cpp b/compiler-rt/lib/gwp_asan/common.cpp index 44935817f8e917..3438c4b91893bc 100644 --- a/compiler-rt/lib/gwp_asan/common.cpp +++ b/compiler-rt/lib/gwp_asan/common.cpp @@ -59,6 +59,11 @@ void AllocationMetadata::CallSiteInfo::RecordBacktrace( uintptr_t UncompressedBuffer[kMaxTraceLengthToCollect]; size_t BacktraceLength = Backtrace(UncompressedBuffer, kMaxTraceLengthToCollect); + // Backtrace() returns the number of available frames, which may be greater + // than the number of frames in the buffer. In this case, we need to only pack + // the number of frames that are in the buffer. + if (BacktraceLength > kMaxTraceLengthToCollect) + BacktraceLength = kMaxTraceLengthToCollect; TraceSize = compression::pack(UncompressedBuffer, BacktraceLength, CompressedTrace, AllocationMetadata::kStackFrameStorageBytes); diff --git a/compiler-rt/lib/gwp_asan/tests/backtrace.cpp b/compiler-rt/lib/gwp_asan/tests/backtrace.cpp index bc81f35cb379fc..6c9a9309ed8b4d 100644 --- a/compiler-rt/lib/gwp_asan/tests/backtrace.cpp +++ b/compiler-rt/lib/gwp_asan/tests/backtrace.cpp @@ -8,6 +8,7 @@ #include +#include "gwp_asan/crash_handler.h" #include "gwp_asan/tests/harness.h" TEST_F(BacktraceGuardedPoolAllocator, DoubleFree) { @@ -15,13 +16,13 @@ TEST_F(BacktraceGuardedPoolAllocator, DoubleFree) { GPA.deallocate(Ptr); std::string DeathRegex = "Double Free.*"; - DeathRegex.append("backtrace\\.cpp:25.*"); + DeathRegex.append("backtrace\\.cpp:26.*"); DeathRegex.append("was deallocated.*"); - DeathRegex.append("backtrace\\.cpp:15.*"); + DeathRegex.append("backtrace\\.cpp:16.*"); DeathRegex.append("was allocated.*"); - DeathRegex.append("backtrace\\.cpp:14.*"); + DeathRegex.append("backtrace\\.cpp:15.*"); ASSERT_DEATH(GPA.deallocate(Ptr), DeathRegex); } @@ -30,12 +31,36 @@ TEST_F(BacktraceGuardedPoolAllocator, UseAfterFree) { GPA.deallocate(Ptr); std::string DeathRegex = "Use After Free.*"; - DeathRegex.append("backtrace\\.cpp:40.*"); + DeathRegex.append("backtrace\\.cpp:41.*"); DeathRegex.append("was deallocated.*"); - DeathRegex.append("backtrace\\.cpp:30.*"); + DeathRegex.append("backtrace\\.cpp:31.*"); DeathRegex.append("was allocated.*"); - DeathRegex.append("backtrace\\.cpp:29.*"); + DeathRegex.append("backtrace\\.cpp:30.*"); ASSERT_DEATH({ *Ptr = 7; }, DeathRegex); } + +TEST(Backtrace, Short) { + gwp_asan::AllocationMetadata Meta; + Meta.AllocationTrace.RecordBacktrace( + [](uintptr_t *TraceBuffer, size_t /* Size */) -> size_t { + TraceBuffer[0] = 123u; + TraceBuffer[1] = 321u; + return 2u; + }); + uintptr_t TraceOutput[2] = {}; + EXPECT_EQ(2u, __gwp_asan_get_allocation_trace(&Meta, TraceOutput, 2)); + EXPECT_EQ(TraceOutput[0], 123u); + EXPECT_EQ(TraceOutput[1], 321u); +} + +TEST(Backtrace, ExceedsStorableLength) { + gwp_asan::AllocationMetadata Meta; + Meta.AllocationTrace.RecordBacktrace( + [](uintptr_t * /* TraceBuffer */, size_t /* Size */) -> size_t { + return SIZE_MAX; // Wow, that's big! + }); + uintptr_t TraceOutput; + EXPECT_EQ(1u, __gwp_asan_get_allocation_trace(&Meta, &TraceOutput, 1)); +} From 1ac403bd145dadfa1004af29bd6c77f871caf42c Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 27 Feb 2020 08:14:38 -0500 Subject: [PATCH 04/33] [libc++] Build the dylib with C++17 to allow aligned new/delete This allows simplifying the implementation of barriers. Differential Revision: https://reviews.llvm.org/D75243 --- libcxx/CMakeLists.txt | 6 +++--- libcxx/src/barrier.cpp | 12 +++--------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index f6097a37c71e35..6b2c4714dc0b0d 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -519,10 +519,10 @@ remove_flags(-Wno-pedantic -pedantic-errors -pedantic) # Required flags ============================================================== function(cxx_add_basic_build_flags target) - # Require C++14 for all targets. C++14 is needed to ensure constant - # initialization for certain globals (ex global memory resources). + # Require C++17 for all targets. C++17 is needed to use aligned allocation + # in the dylib. set_target_properties(${target} PROPERTIES - CXX_STANDARD 14 + CXX_STANDARD 17 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO) diff --git a/libcxx/src/barrier.cpp b/libcxx/src/barrier.cpp index c5e33cbba3bd0b..9ee476993b8133 100644 --- a/libcxx/src/barrier.cpp +++ b/libcxx/src/barrier.cpp @@ -26,21 +26,15 @@ class __barrier_algorithm_base { } __tickets[64]; }; - ptrdiff_t& __expected; - unique_ptr __state_allocation; - __state_t* __state; + ptrdiff_t& __expected; + unique_ptr<__state_t[]> __state; _LIBCPP_HIDDEN __barrier_algorithm_base(ptrdiff_t& __expected) : __expected(__expected) { size_t const __count = (__expected + 1) >> 1; - size_t const __size = sizeof(__state_t) * __count; - size_t __allocation_size = __size + alignof(__state_t); - __state_allocation = unique_ptr(new char[__allocation_size]); - void* __allocation = __state_allocation.get(); - void* const __state_ = align(alignof(__state_t), __size, __allocation, __allocation_size); - __state = new (__state_) __barrier_algorithm_base::__state_t[__count]; + __state = unique_ptr<__state_t[]>(new __state_t[__count]); } _LIBCPP_HIDDEN bool __arrive(__barrier_phase_t __old_phase) From 7311b274039ea728938666972b1c34cf765a8a5f Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 24 Mar 2020 18:06:23 -0400 Subject: [PATCH 05/33] [libc++] Install a recent CMake in the Docker build bots --- libcxx/utils/docker/debian9/buildbot/install-packages.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/libcxx/utils/docker/debian9/buildbot/install-packages.sh b/libcxx/utils/docker/debian9/buildbot/install-packages.sh index 9956944799cac6..1f18c428004c2e 100755 --- a/libcxx/utils/docker/debian9/buildbot/install-packages.sh +++ b/libcxx/utils/docker/debian9/buildbot/install-packages.sh @@ -12,7 +12,6 @@ apt-get update && \ wget \ unzip \ python \ - cmake \ ninja-build \ curl \ git \ @@ -33,3 +32,8 @@ apt-get update && \ sysvinit-utils \ systemd-sysv && \ rm -rf /var/lib/apt/lists/* + +# Install a recent CMake +yes | apt-get purge cmake +wget https://github.com/Kitware/CMake/releases/download/v3.15.2/cmake-3.15.2-Linux-x86_64.sh -O /tmp/install-cmake.sh +bash /tmp/install-cmake.sh --prefix=/usr --exclude-subdir --skip-license From ec184dd548f7832d09d3c6354a6d9d3db6cacc65 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 24 Mar 2020 22:39:25 +0100 Subject: [PATCH 06/33] [LVI] Convert some checks to assertions; NFC solveBlockValue() should only be called if the value isn't cached yet. Similarly, it does not make sense to "solve" a constant. --- llvm/lib/Analysis/LazyValueInfo.cpp | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 4e757ba6c19a4f..296f27799b9451 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -596,19 +596,9 @@ static ValueLatticeElement getFromRangeMetadata(Instruction *BBI) { } bool LazyValueInfoImpl::solveBlockValue(Value *Val, BasicBlock *BB) { - if (isa(Val)) - return true; - - if (TheCache.hasCachedValueInfo(Val, BB)) { - // If we have a cached value, use that. - LLVM_DEBUG(dbgs() << " reuse BB '" << BB->getName() << "' val=" - << TheCache.getCachedValueInfo(Val, BB) << '\n'); - - // Since we're reusing a cached value, we don't need to update the - // OverDefinedCache. The cache will have been properly updated whenever the - // cached value was inserted. - return true; - } + assert(!isa(Val) && "Value should not be constant"); + assert(!TheCache.hasCachedValueInfo(Val, BB) && + "Value should not be in cache"); // Hold off inserting this value into the Cache in case we have to return // false and come back later. From b684c1a50f70a39ceb51973950c5cca520ce8b2c Mon Sep 17 00:00:00 2001 From: Dan Liew Date: Mon, 23 Mar 2020 19:57:33 -0700 Subject: [PATCH 07/33] Add a `Symbolizer::GetEnvP()` method that allows symbolizer implementations to customise the environment of the symbolizer binary. Summary: This change introduces the `Symbolizer::GetEnvP()` method that returns a pointer to environment array used for spawning the symbolizer process. The motivation is to allow implementations to customise the environment if required. The default implementation just returns `__sanitizer::GetEnviron()` which (provided it's implemented) should preserve the existing behaviours of the various implementations. This change has been plumbed through the `internal_spawn(...)` and `StartSubprocess(...)` process spawning implementations. For the `StartSubprocess()` implementation we need to call `execve()` rather than `execv()` to pass the environment. However, it appears that `internal_execve(...)` exists in sanitizer_common so this patch use that which seems like a nice clean up. Support in the Windows implementation of `SymbolizerProcess:StartSymbolizerSubprocess()` has not been added because the Windows sanitizer runtime doesn't implement `GetEnviron()`. rdar://problem/58789439 Reviewers: kubamracek, yln, dvyukov, vitalybuka, eugenis, phosek, aizatsky, rnk Subscribers: #sanitizers, llvm-commits Tags: #sanitizers Differential Revision: https://reviews.llvm.org/D76666 --- compiler-rt/lib/sanitizer_common/sanitizer_file.h | 4 ++-- compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp | 11 ++++++----- compiler-rt/lib/sanitizer_common/sanitizer_posix.h | 2 +- .../lib/sanitizer_common/sanitizer_posix_libcdep.cpp | 6 ++++-- .../sanitizer_common/sanitizer_symbolizer_internal.h | 2 ++ .../sanitizer_symbolizer_posix_libcdep.cpp | 4 ++-- compiler-rt/lib/sanitizer_common/sanitizer_win.cpp | 3 ++- .../sanitizer_common/tests/sanitizer_linux_test.cpp | 2 +- 8 files changed, 20 insertions(+), 14 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_file.h b/compiler-rt/lib/sanitizer_common/sanitizer_file.h index 4a78a0e0ac8810..26681f0493d736 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_file.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_file.h @@ -87,8 +87,8 @@ bool IsAbsolutePath(const char *path); // The child process will close all fds after STDERR_FILENO // before passing control to a program. pid_t StartSubprocess(const char *filename, const char *const argv[], - fd_t stdin_fd = kInvalidFd, fd_t stdout_fd = kInvalidFd, - fd_t stderr_fd = kInvalidFd); + const char *const envp[], fd_t stdin_fd = kInvalidFd, + fd_t stdout_fd = kInvalidFd, fd_t stderr_fd = kInvalidFd); // Checks if specified process is still running bool IsProcessRunning(pid_t pid); // Waits for the process to finish and returns its exit code. diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp index c025f70df028ab..618f902a3098c3 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp @@ -246,7 +246,8 @@ int internal_sysctlbyname(const char *sname, void *oldp, uptr *oldlenp, (size_t)newlen); } -static fd_t internal_spawn_impl(const char *argv[], pid_t *pid) { +static fd_t internal_spawn_impl(const char *argv[], const char *envp[], + pid_t *pid) { fd_t master_fd = kInvalidFd; fd_t slave_fd = kInvalidFd; @@ -302,8 +303,8 @@ static fd_t internal_spawn_impl(const char *argv[], pid_t *pid) { // posix_spawn char **argv_casted = const_cast(argv); - char **env = GetEnviron(); - res = posix_spawn(pid, argv[0], &acts, &attrs, argv_casted, env); + char **envp_casted = const_cast(envp); + res = posix_spawn(pid, argv[0], &acts, &attrs, argv_casted, envp_casted); if (res != 0) return kInvalidFd; // Disable echo in the new terminal, disable CR. @@ -320,7 +321,7 @@ static fd_t internal_spawn_impl(const char *argv[], pid_t *pid) { return fd; } -fd_t internal_spawn(const char *argv[], pid_t *pid) { +fd_t internal_spawn(const char *argv[], const char *envp[], pid_t *pid) { // The client program may close its stdin and/or stdout and/or stderr thus // allowing open/posix_openpt to reuse file descriptors 0, 1 or 2. In this // case the communication is broken if either the parent or the child tries to @@ -335,7 +336,7 @@ fd_t internal_spawn(const char *argv[], pid_t *pid) { break; } - fd_t fd = internal_spawn_impl(argv, pid); + fd_t fd = internal_spawn_impl(argv, envp, pid); for (; count > 0; count--) { internal_close(low_fds[count]); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_posix.h index 70c71f04d2d345..a1b49702da23b7 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_posix.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix.h @@ -63,7 +63,7 @@ uptr internal_ptrace(int request, int pid, void *addr, void *data); uptr internal_waitpid(int pid, int *status, int options); int internal_fork(); -fd_t internal_spawn(const char *argv[], pid_t *pid); +fd_t internal_spawn(const char *argv[], const char *envp[], pid_t *pid); int internal_sysctl(const int *name, unsigned int namelen, void *oldp, uptr *oldlenp, const void *newp, uptr newlen); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp index 304b3a01a08b6a..f920172c06d634 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp @@ -426,7 +426,8 @@ void AdjustStackSize(void *attr_) { #endif // !SANITIZER_GO pid_t StartSubprocess(const char *program, const char *const argv[], - fd_t stdin_fd, fd_t stdout_fd, fd_t stderr_fd) { + const char *const envp[], fd_t stdin_fd, fd_t stdout_fd, + fd_t stderr_fd) { auto file_closer = at_scope_exit([&] { if (stdin_fd != kInvalidFd) { internal_close(stdin_fd); @@ -469,7 +470,8 @@ pid_t StartSubprocess(const char *program, const char *const argv[], for (int fd = sysconf(_SC_OPEN_MAX); fd > 2; fd--) internal_close(fd); - execv(program, const_cast(&argv[0])); + internal_execve(program, const_cast(&argv[0]), + const_cast(envp)); internal__exit(1); } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h index c04797dd61b8b6..06395433084227 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h @@ -86,6 +86,8 @@ class SymbolizerProcess { // Customizable by subclasses. virtual bool StartSymbolizerSubprocess(); virtual bool ReadFromSymbolizer(char *buffer, uptr max_length); + // Return the environment to run the symbolizer in. + virtual char **GetEnvP() { return GetEnviron(); } private: virtual bool ReachedEndOfOutput(const char *buffer, uptr length) const { diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp index c123ecb11206cc..4c3cd966dd5a0a 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp @@ -153,7 +153,7 @@ bool SymbolizerProcess::StartSymbolizerSubprocess() { if (use_posix_spawn_) { #if SANITIZER_MAC - fd_t fd = internal_spawn(argv, &pid); + fd_t fd = internal_spawn(argv, const_cast(GetEnvP()), &pid); if (fd == kInvalidFd) { Report("WARNING: failed to spawn external symbolizer (errno: %d)\n", errno); @@ -173,7 +173,7 @@ bool SymbolizerProcess::StartSymbolizerSubprocess() { return false; } - pid = StartSubprocess(path_, argv, /* stdin */ outfd[0], + pid = StartSubprocess(path_, argv, GetEnvP(), /* stdin */ outfd[0], /* stdout */ infd[1]); if (pid < 0) { internal_close(infd[0]); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp index 73dc042b69f171..fca15beb61612d 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp @@ -1064,7 +1064,8 @@ char **GetEnviron() { } pid_t StartSubprocess(const char *program, const char *const argv[], - fd_t stdin_fd, fd_t stdout_fd, fd_t stderr_fd) { + const char *const envp[], fd_t stdin_fd, fd_t stdout_fd, + fd_t stderr_fd) { // FIXME: implement on this platform // Should be implemented based on // SymbolizerProcess::StarAtSymbolizerSubprocess diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp index 1d8e7e8af26ca3..cb6c0724ac884b 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp @@ -264,7 +264,7 @@ TEST(SanitizerCommon, StartSubprocessTest) { const char *shell = "/bin/sh"; #endif const char *argv[] = {shell, "-c", "echo -n 'hello'", (char *)NULL}; - int pid = StartSubprocess(shell, argv, + int pid = StartSubprocess(shell, argv, GetEnviron(), /* stdin */ kInvalidFd, /* stdout */ pipe_fds[1]); ASSERT_GT(pid, 0); From 46bb6613a31fd43b6d4485ce7e71a387dc22cbc7 Mon Sep 17 00:00:00 2001 From: MaheshRavishankar Date: Tue, 24 Mar 2020 15:55:36 -0700 Subject: [PATCH 08/33] [mlir][GPU] Use StructAttr to drive lowering from loop.parallel to gpu.launch Current implementation of lowering from loop.parallel to gpu.launch uses a DictionaryAttr to specify the mapping. Moving this attribute to be auto-generated from specification as a StructAttr. This simplifies a lot the logic of looking up and creating this attribute. Differential Revision: https://reviews.llvm.org/D76165 --- mlir/include/mlir/Dialect/GPU/CMakeLists.txt | 12 ++- mlir/include/mlir/Dialect/GPU/GPUBase.td | 58 ++++++++++++ mlir/include/mlir/Dialect/GPU/GPUOps.td | 37 +------- .../mlir/Dialect/GPU/ParallelLoopMapper.h | 49 +++++++--- .../Dialect/GPU/ParallelLoopMapperAttr.td | 51 ++++++++++ mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp | 93 ++++++++++--------- mlir/lib/Dialect/GPU/CMakeLists.txt | 2 + .../GPU/Transforms/ParallelLoopMapper.cpp | 92 +++++++++++++++--- .../Conversion/LoopsToGPU/parallel_loop.mlir | 4 +- 9 files changed, 284 insertions(+), 114 deletions(-) create mode 100644 mlir/include/mlir/Dialect/GPU/GPUBase.td create mode 100644 mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td diff --git a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt index 3122b8421cbb5d..d341303d62da29 100644 --- a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt @@ -1,2 +1,12 @@ -add_mlir_dialect(GPUOps gpu) +add_mlir_dialect(GPUOps gpu GPUOps) add_mlir_doc(GPUOps -gen-dialect-doc GPUDialect Dialects/) + +set(LLVM_TARGET_DEFINITIONS ParallelLoopMapperAttr.td) +mlir_tablegen(ParallelLoopMapperAttr.h.inc -gen-struct-attr-decls) +mlir_tablegen(ParallelLoopMapperAttr.cpp.inc -gen-struct-attr-defs) +add_public_tablegen_target(MLIRParallelLoopMapperAttrGen) + +set(LLVM_TARGET_DEFINITIONS ParallelLoopMapperAttr.td) +mlir_tablegen(ParallelLoopMapperEnums.h.inc -gen-enum-decls) +mlir_tablegen(ParallelLoopMapperEnums.cpp.inc -gen-enum-defs) +add_public_tablegen_target(MLIRParallelLoopMapperEnumsGen) diff --git a/mlir/include/mlir/Dialect/GPU/GPUBase.td b/mlir/include/mlir/Dialect/GPU/GPUBase.td new file mode 100644 index 00000000000000..39e2f1a940d95e --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/GPUBase.td @@ -0,0 +1,58 @@ +//===-- GPUBase.td - GPU dialect definitions ---------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the GPU dialect +// +//===----------------------------------------------------------------------===// + +#ifndef GPU_BASE +#define GPU_BASE + +include "mlir/IR/OpBase.td" + +//===----------------------------------------------------------------------===// +// GPU Dialect. +//===----------------------------------------------------------------------===// + +def GPU_Dialect : Dialect { + let name = "gpu"; + let hasOperationAttrVerify = 1; + + let extraClassDeclaration = [{ + /// Get the name of the attribute used to annotate the modules that contain + /// kernel modules. + static StringRef getContainerModuleAttrName() { + return "gpu.container_module"; + } + /// Get the name of the attribute used to annotate external kernel + /// functions. + static StringRef getKernelFuncAttrName() { return "gpu.kernel"; } + + /// Get the name of the attribute used to annotate kernel modules. + static StringRef getKernelModuleAttrName() { return "gpu.kernel_module"; } + + /// Returns whether the given function is a kernel function, i.e., has the + /// 'gpu.kernel' attribute. + static bool isKernel(Operation *op); + + /// Returns the number of workgroup (thread, block) dimensions supported in + /// the GPU dialect. + // TODO(zinenko,herhut): consider generalizing this. + static unsigned getNumWorkgroupDimensions() { return 3; } + + /// Returns the numeric value used to identify the workgroup memory address + /// space. + static unsigned getWorkgroupAddressSpace() { return 3; } + + /// Returns the numeric value used to identify the private memory address + /// space. + static unsigned getPrivateAddressSpace() { return 5; } + }]; +} + +#endif // GPU_BASE diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td index 659c10142e8155..6feaf82405f0f4 100644 --- a/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -13,6 +13,7 @@ #ifndef GPU_OPS #define GPU_OPS +include "mlir/Dialect/GPU/GPUBase.td" include "mlir/Dialect/LLVMIR/LLVMOpBase.td" include "mlir/Interfaces/SideEffects.td" @@ -26,42 +27,6 @@ def IntLikeOrLLVMInt : TypeConstraint< // GPU Dialect operations. //===----------------------------------------------------------------------===// -def GPU_Dialect : Dialect { - let name = "gpu"; - let hasOperationAttrVerify = 1; - - let extraClassDeclaration = [{ - /// Get the name of the attribute used to annotate the modules that contain - /// kernel modules. - static StringRef getContainerModuleAttrName() { - return "gpu.container_module"; - } - /// Get the name of the attribute used to annotate external kernel - /// functions. - static StringRef getKernelFuncAttrName() { return "gpu.kernel"; } - - /// Get the name of the attribute used to annotate kernel modules. - static StringRef getKernelModuleAttrName() { return "gpu.kernel_module"; } - - /// Returns whether the given function is a kernel function, i.e., has the - /// 'gpu.kernel' attribute. - static bool isKernel(Operation *op); - - /// Returns the number of workgroup (thread, block) dimensions supported in - /// the GPU dialect. - // TODO(zinenko,herhut): consider generalizing this. - static unsigned getNumWorkgroupDimensions() { return 3; } - - /// Returns the numeric value used to identify the workgroup memory address - /// space. - static unsigned getWorkgroupAddressSpace() { return 3; } - - /// Returns the numeric value used to identify the private memory address - /// space. - static unsigned getPrivateAddressSpace() { return 5; } - }]; -} - class GPU_Op traits = []> : Op; diff --git a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h index 92fb09ff602086..6bbcafb919253d 100644 --- a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h +++ b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h @@ -14,28 +14,48 @@ #ifndef MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H #define MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H +#include "mlir/IR/Attributes.h" +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/DenseMap.h" + +#include "mlir/Dialect/GPU/ParallelLoopMapperEnums.h.inc" + namespace mlir { +class AffineMap; +struct LogicalResult; +class Operation; class Region; +#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.h.inc" + +namespace loop { +class ParallelOp; +} + namespace gpu { /// Name of the mapping attribute produced by loop mappers. -static constexpr const char *kMappingAttributeName = "mapping"; -/// Name of the processor sub-attribute that identifies the hardware id -/// to map a loop to. -static constexpr const char *kProcessorEntryName = "processor"; -/// Name of the map sub-attribute that identifies the affine map to apply -/// to the hardware id to compute the iteration number of the loop. This -/// map is expected to be extended by step and lower bound computations: -/// index = map(hardware_id) * step + lowerbound -static constexpr const char *kIndexMapEntryName = "map"; -/// Name of the bound sub-attribute that itendities the affine map to -/// compute an upper bound of iterations for the hardware id. This is -/// applied to an upper bound on the number of iterations: -/// launchBound = bound(upperbound-lowerbound ceildiv step) -static constexpr const char *kBoundMapEntryName = "bound"; +StringRef getMappingAttrName(); +/// Get the value of the processor in the ParallelLoopDimMapping attribute. +inline Processor getProcessor(ParallelLoopDimMapping attr) { + return static_cast(attr.processor().getInt()); +} + +/// Helper function to create a ParallelDimMapperAttr. +/// TODO(ravishankarm/antiagainst): Replace its uses with an auto-gened method. +ParallelLoopDimMapping getParallelLoopDimMappingAttr(Processor processor, + AffineMap map, + AffineMap bound); + +/// Sets the mapping attribute of a loop.parallel operation. Verifies that the +/// mapping passed is valid. +/// - the number of DimMapperAttr provided is same as the number of loops of +/// the `ploopOp`. +/// - the mapping does not map multiple loops to the same processor. +LogicalResult setMappingAttr(loop::ParallelOp ploopOp, + ArrayRef mapping); } // end namespace gpu /// Maps the parallel loops found in the given function to workgroups. The first @@ -46,5 +66,4 @@ static constexpr const char *kBoundMapEntryName = "bound"; void greedilyMapParallelLoopsToGPU(Region ®ion); } // end namespace mlir - #endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H diff --git a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td new file mode 100644 index 00000000000000..1bfdfe5ebcfc2a --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td @@ -0,0 +1,51 @@ +//===-- ParallelLoopMapperAttr.td - Attribute definition ---*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the attribute used for driving conversion from loop.parallel to +// gpu.launch operations +// +//===----------------------------------------------------------------------===// + +#ifndef PARALLEL_LOOP_MAPPER_ATTR +#define PARALLEL_LOOP_MAPPER_ATTR + +include "mlir/Dialect/Affine/IR/AffineOpsBase.td" +include "mlir/Dialect/GPU/GPUBase.td" + +def BlockX : I64EnumAttrCase<"BlockX", 0>; +def BlockY : I64EnumAttrCase<"BlockY", 1>; +def BlockZ : I64EnumAttrCase<"BlockZ", 2>; +def ThreadX : I64EnumAttrCase<"ThreadX", 3>; +def ThreadY : I64EnumAttrCase<"ThreadY", 4>; +def ThreadZ : I64EnumAttrCase<"ThreadZ", 5>; +def Sequential : I64EnumAttrCase<"Sequential", 6>; + +def ProcessorAttr : I64EnumAttr<"Processor", "processor for loop mapping", [ + BlockX, BlockY, BlockZ, ThreadX, ThreadY, ThreadZ, Sequential]> { + let cppNamespace = "::mlir::gpu"; +} + +// Attribute that drives conversion of a loop.parallel to gpu.launch +// operation. +// processor: the hardware id to map to. +// map : An affine map that is used to pre-process hardware ids before +// substitution. +// bound : An affine map that is used to compute the bound of the hardware +// id based on an upper bound of the number of iterations. +def ParallelLoopDimMappingAttr : + StructAttr<"ParallelLoopDimMapping", GPU_Dialect, + [StructFieldAttr<"processor", ProcessorAttr>, + StructFieldAttr<"map", AffineMapAttr>, + StructFieldAttr<"bound", AffineMapAttr>]>; + + +def ParallelLoopMappingAttr : + TypedArrayAttrBase; + +#endif // PARALLEL_LOOP_MAPPER_ATTR diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp index ec5c9cff2ee054..b9c81ea45592ed 100644 --- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp +++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp @@ -500,35 +500,8 @@ struct ParallelToGpuLaunchLowering : public OpRewritePattern { LogicalResult matchAndRewrite(ParallelOp parallelOp, PatternRewriter &rewriter) const override; }; - -struct MappingAnnotation { - unsigned processor; - AffineMap indexMap; - AffineMap boundMap; -}; - } // namespace -/// Extracts the mapping annotations from the provided attribute. The attribute -/// is expected to be of the form -/// { processor = , map = , bound = } -/// where the bound is optional. -static MappingAnnotation extractMappingAnnotation(Attribute attribute) { - DictionaryAttr dict = attribute.cast(); - unsigned processor = dict.get(gpu::kProcessorEntryName) - .cast() - .getValue() - .getSExtValue(); - AffineMap map = - dict.get(gpu::kIndexMapEntryName).cast().getValue(); - AffineMapAttr boundAttr = - dict.get(gpu::kBoundMapEntryName).dyn_cast_or_null(); - AffineMap bound; - if (boundAttr) - bound = boundAttr.getValue(); - return {processor, map, bound}; -} - /// Tries to derive a static upper bound from the defining operation of /// `upperBound`. static Value deriveStaticUpperBound(Value upperBound, @@ -546,6 +519,30 @@ static Value deriveStaticUpperBound(Value upperBound, return {}; } +static bool isMappedToProcessor(gpu::Processor processor) { + return processor != gpu::Processor::Sequential; +} + +static unsigned getLaunchOpArgumentNum(gpu::Processor processor) { + switch (processor) { + case gpu::Processor::BlockX: + return 0; + case gpu::Processor::BlockY: + return 1; + case gpu::Processor::BlockZ: + return 2; + case gpu::Processor::ThreadX: + return 3; + case gpu::Processor::ThreadY: + return 4; + case gpu::Processor::ThreadZ: + return 5; + default:; + } + llvm_unreachable( + "invalid processor type while retrieving launch op argument number"); +} + /// Modifies the current transformation state to capture the effect of the given /// `loop.parallel` operation on index substitutions and the operations to be /// inserted. @@ -568,16 +565,14 @@ static Value deriveStaticUpperBound(Value upperBound, /// inserted, a sentinel (the `gpu.launch` operation) is inserted into the /// worklist. This signals the processor of the worklist to pop the rewriter /// one scope-level up. -static LogicalResult processParallelLoop(ParallelOp parallelOp, - gpu::LaunchOp launchOp, - BlockAndValueMapping &cloningMap, - SmallVectorImpl &worklist, - DenseMap &bounds, - PatternRewriter &rewriter) { +static LogicalResult processParallelLoop( + ParallelOp parallelOp, gpu::LaunchOp launchOp, + BlockAndValueMapping &cloningMap, SmallVectorImpl &worklist, + DenseMap &bounds, PatternRewriter &rewriter) { // TODO(herhut): Verify that this is a valid GPU mapping. // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential ArrayAttr mapping = - parallelOp.getAttrOfType(gpu::kMappingAttributeName); + parallelOp.getAttrOfType(gpu::getMappingAttrName()); // TODO(herhut): Support reductions. if (!mapping || parallelOp.getNumResults() != 0) @@ -604,12 +599,17 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp, Attribute mappingAttribute; Value iv, lowerBound, upperBound, step; std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config; - MappingAnnotation annotation = extractMappingAnnotation(mappingAttribute); + auto annotation = mappingAttribute.dyn_cast(); + if (!annotation) + return parallelOp.emitOpError() + << "expected mapping attribute for lowering to GPU"; Value newIndex; + gpu::Processor processor = gpu::getProcessor(annotation); - if (annotation.processor < gpu::LaunchOp::kNumConfigOperands) { + if (isMappedToProcessor(processor)) { // Use the corresponding thread/grid index as replacement for the loop iv. - Value operand = launchOp.body().front().getArgument(annotation.processor); + Value operand = launchOp.body().front().getArgument( + getLaunchOpArgumentNum(processor)); // Take the indexmap and add the lower bound and step computations in. // This computes operand * step + lowerBound. // Use an affine map here so that it composes nicely with the provided @@ -619,11 +619,11 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp, rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) + rewriter.getAffineSymbolExpr(1)); newIndex = rewriter.create( - loc, annotation.indexMap.compose(lowerAndStep), + loc, annotation.map().getValue().compose(lowerAndStep), ValueRange{operand, step, lowerBound}); // If there was also a bound, insert that, too. // TODO(herhut): Check that we do not assign bounds twice. - if (annotation.boundMap) { + if (annotation.bound().getValue()) { // We pass as the single opererand to the bound-map the number of // iterations, which is (upperBound - lowerBound) ceilDiv step. To // support inner loops with dynamic upper bounds (as generated by e.g. @@ -663,19 +663,21 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp, rewriter.getAffineSymbolExpr(1)) .ceilDiv(rewriter.getAffineSymbolExpr(2)))); Value launchBound = rewriter.create( - loc, annotation.boundMap.compose(stepMap), + loc, annotation.bound().getValue().compose(stepMap), ValueRange{ ensureLaunchIndependent( cloningMap.lookupOrDefault(upperBound)), ensureLaunchIndependent( cloningMap.lookupOrDefault(lowerBound)), ensureLaunchIndependent(cloningMap.lookupOrDefault(step))}); - if (bounds.find(annotation.processor) != bounds.end()) { + // todo(herhut,ravishankarm): Update the behavior of setMappingAttr + // when this condition is relaxed. + if (bounds.find(processor) != bounds.end()) { return parallelOp.emitOpError() << "cannot redefine the bound for processor " - << annotation.processor; + << static_cast(processor); } - bounds[annotation.processor] = launchBound; + bounds[processor] = launchBound; } if (!boundIsPrecise) { // We are using an approximation, create a surrounding conditional. @@ -757,7 +759,7 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, rewriter.setInsertionPointToStart(&launchOp.body().front()); BlockAndValueMapping cloningMap; - llvm::DenseMap launchBounds; + llvm::DenseMap launchBounds; SmallVector worklist; if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist, launchBounds, rewriter))) @@ -809,7 +811,8 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, // Now that we succeeded creating the launch operation, also update the // bounds. for (auto bound : launchBounds) - launchOp.setOperand(std::get<0>(bound), std::get<1>(bound)); + launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)), + std::get<1>(bound)); rewriter.eraseOp(parallelOp); return success(); diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt index b83c3ca6a42132..e71a018a451c33 100644 --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -10,6 +10,8 @@ add_mlir_dialect_library(MLIRGPU DEPENDS MLIRGPUOpsIncGen + MLIRParallelLoopMapperAttrGen + MLIRParallelLoopMapperEnumsGen ) target_link_libraries(MLIRGPU PUBLIC diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp index f85a0c7027295b..9697688ac850c2 100644 --- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp @@ -23,6 +23,43 @@ using namespace mlir; using namespace mlir::gpu; using namespace mlir::loop; +#include "mlir/Dialect/GPU/ParallelLoopMapperEnums.cpp.inc" +namespace mlir { + +#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.cpp.inc" +namespace gpu { + +StringRef getMappingAttrName() { return "mapping"; } + +ParallelLoopDimMapping getParallelLoopDimMappingAttr(Processor processor, + AffineMap map, + AffineMap bound) { + MLIRContext *context = map.getContext(); + OpBuilder builder(context); + return ParallelLoopDimMapping::get( + builder.getI64IntegerAttr(static_cast(processor)), + AffineMapAttr::get(map), AffineMapAttr::get(bound), context); +} + +LogicalResult setMappingAttr(loop::ParallelOp ploopOp, + ArrayRef mapping) { + // Verify that each processor is mapped to only once. + llvm::DenseSet specifiedMappings; + for (auto dimAttr : mapping) { + gpu::Processor processor = getProcessor(dimAttr); + if (processor != gpu::Processor::Sequential && + specifiedMappings.count(processor)) + return ploopOp.emitError( + "invalid mapping multiple loops to same processor"); + } + ArrayRef mappingAsAttrs(mapping.data(), mapping.size()); + ploopOp.setAttr(getMappingAttrName(), + ArrayAttr::get(mappingAsAttrs, ploopOp.getContext())); + return success(); +} +} // namespace gpu +} // namespace mlir + namespace { enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 }; @@ -43,10 +80,41 @@ MappingLevel &operator++(MappingLevel &mappingLevel) { /// Computed the hardware id to use for a given mapping level. Will /// assign x,y and z hardware ids for the first 3 dimensions and use /// sequential after. -static int64_t getHardwareIdForMapping(MappingLevel level, int dimension) { +/// TODO(ravishankarm/herhut) : Make this use x for the inner-most loop that is +/// distributed to map to x, the next innermost to y and the next innermost to +/// z. +static gpu::Processor getHardwareIdForMapping(MappingLevel level, + int dimension) { + if (dimension >= kNumHardwareIds || level == Sequential) - return Sequential * kNumHardwareIds; - return (level * kNumHardwareIds) + dimension; + return Processor::Sequential; + switch (level) { + case MapGrid: + switch (dimension) { + case 0: + return Processor::BlockX; + case 1: + return Processor::BlockY; + case 2: + return Processor::BlockZ; + default: + return Processor::Sequential; + } + break; + case MapBlock: + switch (dimension) { + case 0: + return Processor::ThreadX; + case 1: + return Processor::ThreadY; + case 2: + return Processor::ThreadZ; + default: + return Processor::Sequential; + } + default:; + } + return Processor::Sequential; } /// Add mapping information to the given parallel loop. Do not add @@ -55,26 +123,20 @@ static int64_t getHardwareIdForMapping(MappingLevel level, int dimension) { static void mapParallelOp(ParallelOp parallelOp, MappingLevel mappingLevel = MapGrid) { // Do not try to add a mapping to already mapped loops or nested loops. - if (parallelOp.getAttr(gpu::kMappingAttributeName) || + if (parallelOp.getAttr(getMappingAttrName()) || ((mappingLevel == MapGrid) && parallelOp.getParentOfType())) return; MLIRContext *ctx = parallelOp.getContext(); Builder b(ctx); - SmallVector attrs; + SmallVector attrs; attrs.reserve(parallelOp.getNumInductionVars()); for (int i = 0, e = parallelOp.getNumInductionVars(); i < e; ++i) { - SmallVector entries; - entries.emplace_back(b.getNamedAttr( - kProcessorEntryName, - b.getI64IntegerAttr(getHardwareIdForMapping(mappingLevel, i)))); - entries.emplace_back(b.getNamedAttr( - kIndexMapEntryName, AffineMapAttr::get(b.getDimIdentityMap()))); - entries.emplace_back(b.getNamedAttr( - kBoundMapEntryName, AffineMapAttr::get(b.getDimIdentityMap()))); - attrs.push_back(DictionaryAttr::get(entries, ctx)); + attrs.push_back(getParallelLoopDimMappingAttr( + getHardwareIdForMapping(mappingLevel, i), b.getDimIdentityMap(), + b.getDimIdentityMap())); } - parallelOp.setAttr(kMappingAttributeName, ArrayAttr::get(attrs, ctx)); + setMappingAttr(parallelOp, attrs); ++mappingLevel; // Parallel loop operations are immediately nested, so do not use // walk but just iterate over the operations. diff --git a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir index 4bc97da954ff9d..ab195936d83af8 100644 --- a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir +++ b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir @@ -3,7 +3,7 @@ // 2-d parallel loop mapped to block.y and block.x func @parallel_loop_bidy_bidx(%arg0 : index, %arg1 : index, %arg2 : index, - %arg3 : index, %arg4 : index, + %arg3 : index, %arg4 : index, %buf : memref, %res : memref) { %step = constant 2 : index @@ -334,7 +334,7 @@ func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : ind // expected-error@+1 {{failed to legalize operation 'loop.parallel'}} loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) step (%four, %four) { - // expected-error@+1 {{cannot derive loop-invariant upper bound}} + // expected-error@+1 {{cannot derive loop-invariant upper bound}} loop.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1) step (%one, %one) { %idx0 = addi %i0, %si0 : index From 39c55cef21b4ca9863edf82b6d9995fb1e28c37d Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 13 Feb 2020 15:52:32 -0500 Subject: [PATCH 09/33] GlobalISel: Introduce bitcast legalize action For some operations, the type is unimportant and only the number of bits matters. For example I don't want to treat <4 x s8> as a legal type, but I also don't want to decompose loads of this into smaller pieces to get legal register types. On AMDGPU in SelectionDAG, we legalize a number of operations (most notably load and store) by coercing all types to vectors of i32. For GlobalISel, I'm trying very hard to avoid doing this for every type, but I don't think this strategy can be completely avoided. I'm trying to avoid bitcasts for any legitimately legal type we can operate on, since the intervening bitcasts have proven to be a hassle. For loads, I think I can get away without ever casting the result type, and handling any arbitrary bitwidth during selection (I will eventually want new tablegen support to help with this, rather than having to add every possible type as legal). The unmerge required to do anything with the value should expand to the expected shifts. This is trickier for stores, since it would now require handling a wide array of truncates during selection which I don't want. Future potentially interesting case are for vector indexing, where sub-dword type should be indexed in s32 pieces. --- .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 11 ++ .../llvm/CodeGen/GlobalISel/LegalizerInfo.h | 3 + .../CodeGen/GlobalISel/LegalizerHelper.cpp | 71 ++++++++ llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp | 7 + .../GlobalISel/LegalizerHelperTest.cpp | 157 ++++++++++++++++++ .../CodeGen/GlobalISel/LegalizerInfoTest.cpp | 1 + 6 files changed, 250 insertions(+) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 08d12a57adff0e..60c0a71762e510 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -74,6 +74,9 @@ class LegalizerHelper { /// precision, ignoring the unused bits). LegalizeResult widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy); + /// Legalize an instruction by replacing the value type + LegalizeResult bitcast(MachineInstr &MI, unsigned TypeIdx, LLT Ty); + /// Legalize an instruction by splitting it into simpler parts, hopefully /// understood by the target. LegalizeResult lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty); @@ -128,6 +131,14 @@ class LegalizerHelper { /// original vector type, and replacing the vreg of the operand in place. void moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy, unsigned OpIdx); + /// Legalize a single operand \p OpIdx of the machine instruction \p MI as a + /// use by inserting a G_BITCAST to \p CastTy + void bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx); + + /// Legalize a single operand \p OpIdx of the machine instruction \p MI as a + /// def by inserting a G_BITCAST from \p CastTy + void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx); + private: LegalizeResult widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx, LLT WideTy); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index e2ed44e283d889..624fa70f1aa69a 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -68,6 +68,9 @@ enum LegalizeAction : std::uint8_t { /// the first two results. MoreElements, + /// Perform the operation on a different, but equivalently sized type. + Bitcast, + /// The operation itself must be expressed in terms of simpler actions on /// this target. E.g. a SREM replaced by an SDIV and subtraction. Lower, diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 63dff5c96fd6cc..ac9b35aeb18c6d 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -120,6 +120,9 @@ LegalizerHelper::legalizeInstrStep(MachineInstr &MI) { case WidenScalar: LLVM_DEBUG(dbgs() << ".. Widen scalar\n"); return widenScalar(MI, Step.TypeIdx, Step.NewType); + case Bitcast: + LLVM_DEBUG(dbgs() << ".. Bitcast type\n"); + return bitcast(MI, Step.TypeIdx, Step.NewType); case Lower: LLVM_DEBUG(dbgs() << ".. Lower\n"); return lower(MI, Step.TypeIdx, Step.NewType); @@ -1251,6 +1254,19 @@ void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy, MO.setReg(MoreReg); } +void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) { + MachineOperand &Op = MI.getOperand(OpIdx); + Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0)); +} + +void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) { + MachineOperand &MO = MI.getOperand(OpIdx); + Register CastDst = MRI.createGenericVirtualRegister(CastTy); + MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); + MIRBuilder.buildBitcast(MO, CastDst); + MO.setReg(CastDst); +} + LegalizerHelper::LegalizeResult LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { @@ -2104,6 +2120,61 @@ LegalizerHelper::lowerBitcast(MachineInstr &MI) { return UnableToLegalize; } +LegalizerHelper::LegalizeResult +LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) { + MIRBuilder.setInstr(MI); + + switch (MI.getOpcode()) { + case TargetOpcode::G_LOAD: { + if (TypeIdx != 0) + return UnableToLegalize; + + Observer.changingInstr(MI); + bitcastDst(MI, CastTy, 0); + Observer.changedInstr(MI); + return Legalized; + } + case TargetOpcode::G_STORE: { + if (TypeIdx != 0) + return UnableToLegalize; + + Observer.changingInstr(MI); + bitcastSrc(MI, CastTy, 0); + Observer.changedInstr(MI); + return Legalized; + } + case TargetOpcode::G_SELECT: { + if (TypeIdx != 0) + return UnableToLegalize; + + if (MRI.getType(MI.getOperand(1).getReg()).isVector()) { + LLVM_DEBUG( + dbgs() << "bitcast action not implemented for vector select\n"); + return UnableToLegalize; + } + + Observer.changingInstr(MI); + bitcastSrc(MI, CastTy, 2); + bitcastSrc(MI, CastTy, 3); + bitcastDst(MI, CastTy, 0); + Observer.changedInstr(MI); + return Legalized; + } + case TargetOpcode::G_AND: + case TargetOpcode::G_OR: + case TargetOpcode::G_XOR: { + Observer.changingInstr(MI); + bitcastSrc(MI, CastTy, 1); + bitcastSrc(MI, CastTy, 2); + bitcastDst(MI, CastTy, 0); + Observer.changedInstr(MI); + return Legalized; + } + default: + return UnableToLegalize; + } +} + LegalizerHelper::LegalizeResult LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { using namespace TargetOpcode; diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index cad08a39593623..2658434fe94916 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -59,6 +59,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, LegalizeAction Action) { case MoreElements: OS << "MoreElements"; break; + case Bitcast: + OS << "Bitcast"; + break; case Lower: OS << "Lower"; break; @@ -173,6 +176,9 @@ static bool mutationIsSane(const LegalizeRule &Rule, return true; } + case Bitcast: { + return OldTy != NewTy && OldTy.getSizeInBits() == NewTy.getSizeInBits(); + } default: return true; } @@ -575,6 +581,7 @@ LegalizerInfo::findAction(const SizeAndActionsVec &Vec, const uint32_t Size) { LegalizeAction Action = Vec[VecIdx].second; switch (Action) { case Legal: + case Bitcast: case Lower: case Libcall: case Custom: diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp index 3bc8f1f8f24059..244cb75c3e2970 100644 --- a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp @@ -2565,4 +2565,161 @@ TEST_F(AArch64GISelMITest, WidenUnmerge) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } +TEST_F(AArch64GISelMITest, BitcastLoad) { + setUp(); + if (!TM) + return; + + LLT P0 = LLT::pointer(0, 64); + LLT S32 = LLT::scalar(32); + LLT V4S8 = LLT::vector(4, 8); + auto Ptr = B.buildUndef(P0); + + DefineLegalizerInfo(A, {}); + + MachineMemOperand *MMO = B.getMF().getMachineMemOperand( + MachinePointerInfo(), MachineMemOperand::MOLoad, 4, 4); + auto Load = B.buildLoad(V4S8, Ptr, *MMO); + + AInfo Info(MF->getSubtarget()); + DummyGISelObserver Observer; + LegalizerHelper Helper(*MF, Info, Observer, B); + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.bitcast(*Load, 0, S32)); + + auto CheckStr = R"( + CHECK: [[PTR:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF + CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD + CHECK: [[CAST:%[0-9]+]]:_(<4 x s8>) = G_BITCAST [[LOAD]] + + )"; + + // Check + EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; +} + +TEST_F(AArch64GISelMITest, BitcastStore) { + setUp(); + if (!TM) + return; + + LLT P0 = LLT::pointer(0, 64); + LLT S32 = LLT::scalar(32); + LLT V4S8 = LLT::vector(4, 8); + auto Ptr = B.buildUndef(P0); + + DefineLegalizerInfo(A, {}); + + MachineMemOperand *MMO = B.getMF().getMachineMemOperand( + MachinePointerInfo(), MachineMemOperand::MOStore, 4, 4); + auto Val = B.buildUndef(V4S8); + auto Store = B.buildStore(Val, Ptr, *MMO); + + AInfo Info(MF->getSubtarget()); + DummyGISelObserver Observer; + LegalizerHelper Helper(*MF, Info, Observer, B); + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.bitcast(*Store, 0, S32)); + + auto CheckStr = R"( + CHECK: [[VAL:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF + CHECK: [[CAST:%[0-9]+]]:_(s32) = G_BITCAST [[VAL]] + CHECK: G_STORE [[CAST]] + )"; + + // Check + EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; +} + +TEST_F(AArch64GISelMITest, BitcastSelect) { + setUp(); + if (!TM) + return; + + LLT S1 = LLT::scalar(1); + LLT S32 = LLT::scalar(32); + LLT V4S8 = LLT::vector(4, 8); + + DefineLegalizerInfo(A, {}); + + auto Cond = B.buildUndef(S1); + auto Val0 = B.buildConstant(V4S8, 123); + auto Val1 = B.buildConstant(V4S8, 99); + + auto Select = B.buildSelect(V4S8, Cond, Val0, Val1); + + AInfo Info(MF->getSubtarget()); + DummyGISelObserver Observer; + LegalizerHelper Helper(*MF, Info, Observer, B); + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.bitcast(*Select, 0, S32)); + + auto CheckStr = R"( + CHECK: [[VAL0:%[0-9]+]]:_(<4 x s8>) = G_BUILD_VECTOR + CHECK: [[VAL1:%[0-9]+]]:_(<4 x s8>) = G_BUILD_VECTOR + CHECK: [[CAST0:%[0-9]+]]:_(s32) = G_BITCAST [[VAL0]] + CHECK: [[CAST1:%[0-9]+]]:_(s32) = G_BITCAST [[VAL1]] + CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT %{{[0-9]+}}:_(s1), [[CAST0]]:_, [[CAST1]]:_ + CHECK: [[CAST2:%[0-9]+]]:_(<4 x s8>) = G_BITCAST [[SELECT]] + )"; + + // Check + EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; + + // Doesn't make sense + auto VCond = B.buildUndef(LLT::vector(4, 1)); + auto VSelect = B.buildSelect(V4S8, VCond, Val0, Val1); + EXPECT_EQ(LegalizerHelper::LegalizeResult::UnableToLegalize, + Helper.bitcast(*VSelect, 0, S32)); + EXPECT_EQ(LegalizerHelper::LegalizeResult::UnableToLegalize, + Helper.bitcast(*VSelect, 1, LLT::scalar(4))); +} + +TEST_F(AArch64GISelMITest, BitcastBitOps) { + setUp(); + if (!TM) + return; + + LLT S32 = LLT::scalar(32); + LLT V4S8 = LLT::vector(4, 8); + + DefineLegalizerInfo(A, {}); + + auto Val0 = B.buildConstant(V4S8, 123); + auto Val1 = B.buildConstant(V4S8, 99); + auto And = B.buildAnd(V4S8, Val0, Val1); + auto Or = B.buildOr(V4S8, Val0, Val1); + auto Xor = B.buildXor(V4S8, Val0, Val1); + + AInfo Info(MF->getSubtarget()); + DummyGISelObserver Observer; + LegalizerHelper Helper(*MF, Info, Observer, B); + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.bitcast(*And, 0, S32)); + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.bitcast(*Or, 0, S32)); + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.bitcast(*Xor, 0, S32)); + + auto CheckStr = R"( + CHECK: [[VAL0:%[0-9]+]]:_(<4 x s8>) = G_BUILD_VECTOR + CHECK: [[VAL1:%[0-9]+]]:_(<4 x s8>) = G_BUILD_VECTOR + CHECK: [[CAST0:%[0-9]+]]:_(s32) = G_BITCAST [[VAL0]] + CHECK: [[CAST1:%[0-9]+]]:_(s32) = G_BITCAST [[VAL1]] + CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[CAST0]]:_, [[CAST1]]:_ + CHECK: [[CAST_AND:%[0-9]+]]:_(<4 x s8>) = G_BITCAST [[AND]] + CHECK: [[CAST2:%[0-9]+]]:_(s32) = G_BITCAST [[VAL0]] + CHECK: [[CAST3:%[0-9]+]]:_(s32) = G_BITCAST [[VAL1]] + CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[CAST2]]:_, [[CAST3]]:_ + CHECK: [[CAST_OR:%[0-9]+]]:_(<4 x s8>) = G_BITCAST [[OR]] + CHECK: [[CAST4:%[0-9]+]]:_(s32) = G_BITCAST [[VAL0]] + CHECK: [[CAST5:%[0-9]+]]:_(s32) = G_BITCAST [[VAL1]] + CHECK: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[CAST4]]:_, [[CAST5]]:_ + CHECK: [[CAST_XOR:%[0-9]+]]:_(<4 x s8>) = G_BITCAST [[XOR]] + )"; + + // Check + EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; +} + } // namespace diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp index b342143e13942f..7fd2ea453a2ac8 100644 --- a/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp @@ -27,6 +27,7 @@ operator<<(std::ostream &OS, const LegalizeAction Act) { case MoreElements: OS << "MoreElements"; break; case Libcall: OS << "Libcall"; break; case Custom: OS << "Custom"; break; + case Bitcast: OS << "Bitcast"; break; case Unsupported: OS << "Unsupported"; break; case NotFound: OS << "NotFound"; break; case UseLegacyRules: OS << "UseLegacyRules"; break; From ed8ad6ec1562a980d8d6a8fea29703030acc125b Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Wed, 18 Mar 2020 15:31:39 -0700 Subject: [PATCH 10/33] Add an -object-path-prefix option to dsymutil to remap object file paths (but no source paths) before processing. This is meant to be used for Clang objects where the module cache location was remapped using ``-fdebug-prefix-map``; to help dsymutil find the Clang module cache. Differential Revision: https://reviews.llvm.org/D76391 --- llvm/docs/CommandGuide/dsymutil.rst | 6 ++++++ llvm/include/llvm/DWARFLinker/DWARFLinker.h | 9 +++++++++ llvm/lib/DWARFLinker/DWARFLinker.cpp | 10 ++++++++++ llvm/test/tools/dsymutil/X86/object-prefix-path.test | 11 +++++++++++ llvm/test/tools/dsymutil/cmdline.test | 1 + llvm/tools/dsymutil/DwarfLinkerForBinary.cpp | 1 + llvm/tools/dsymutil/LinkUtils.h | 3 +++ llvm/tools/dsymutil/Options.td | 9 +++++++++ llvm/tools/dsymutil/dsymutil.cpp | 6 ++++++ 9 files changed, 56 insertions(+) create mode 100644 llvm/test/tools/dsymutil/X86/object-prefix-path.test diff --git a/llvm/docs/CommandGuide/dsymutil.rst b/llvm/docs/CommandGuide/dsymutil.rst index 4aa9c1c7c49dd3..2f638daf2ae3fe 100644 --- a/llvm/docs/CommandGuide/dsymutil.rst +++ b/llvm/docs/CommandGuide/dsymutil.rst @@ -71,6 +71,12 @@ OPTIONS Specifies a ``path`` to prepend to all debug symbol object file paths. +.. option:: --object-prefix-map= + + Remap object file paths (but no source paths) before processing. Use + this for Clang objects where the module cache location was remapped using + ``-fdebug-prefix-map``; to help dsymutil find the Clang module cache. + .. option:: --papertrail When running dsymutil as part of your build system, it can be desirable for diff --git a/llvm/include/llvm/DWARFLinker/DWARFLinker.h b/llvm/include/llvm/DWARFLinker/DWARFLinker.h index af49665391bbeb..d75ed8b98949ba 100644 --- a/llvm/include/llvm/DWARFLinker/DWARFLinker.h +++ b/llvm/include/llvm/DWARFLinker/DWARFLinker.h @@ -226,6 +226,7 @@ typedef std::function(StringRef ContainerName, StringRef Path)> objFileLoader; typedef std::map swiftInterfacesMap; +typedef std::map objectPrefixMap; /// The core of the Dwarf linking logic. /// @@ -311,6 +312,11 @@ class DWARFLinker { Options.ParseableSwiftInterfaces = Map; } + /// Set prefix map for objects. + void setObjectPrefixMap(objectPrefixMap *Map) { + Options.ObjectPrefixMap = Map; + } + private: /// Flags passed to DwarfLinker::lookForDIEsToKeep enum TraversalFlags { @@ -783,6 +789,9 @@ class DWARFLinker { /// per compile unit, which is why this is a std::map. /// this is dsymutil specific fag. swiftInterfacesMap *ParseableSwiftInterfaces = nullptr; + + /// A list of remappings to apply to file paths. + objectPrefixMap *ObjectPrefixMap = nullptr; } Options; }; diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp index ab18cb005a3a81..6444af9046b8dd 100644 --- a/llvm/lib/DWARFLinker/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp @@ -1918,6 +1918,14 @@ static uint64_t getDwoId(const DWARFDie &CUDie, const DWARFUnit &Unit) { return 0; } +static std::string remapPath(StringRef Path, + const objectPrefixMap &ObjectPrefixMap) { + for (const auto &Entry : ObjectPrefixMap) + if (Path.startswith(Entry.first)) + return (Twine(Entry.second) + Path.substr(Entry.first.size())).str(); + return Path.str(); +} + bool DWARFLinker::registerModuleReference( DWARFDie CUDie, const DWARFUnit &Unit, const DwarfFile &File, OffsetsStringPool &StringPool, UniquingStringPool &UniquingStringPool, @@ -1927,6 +1935,8 @@ bool DWARFLinker::registerModuleReference( CUDie.find({dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}), ""); if (PCMfile.empty()) return false; + if (Options.ObjectPrefixMap) + PCMfile = remapPath(PCMfile, *Options.ObjectPrefixMap); // Clang module DWARF skeleton CUs abuse this for the path to the module. uint64_t DwoId = getDwoId(CUDie, Unit); diff --git a/llvm/test/tools/dsymutil/X86/object-prefix-path.test b/llvm/test/tools/dsymutil/X86/object-prefix-path.test new file mode 100644 index 00000000000000..16956e0f94521e --- /dev/null +++ b/llvm/test/tools/dsymutil/X86/object-prefix-path.test @@ -0,0 +1,11 @@ +RUN: rm -rf %t.dir && mkdir %t.dir && mkdir %t.dir/ModuleCacheRenamed +RUN: cp %p/../Inputs/module-warnings/1.o %t.dir +RUN: cp %p/../Inputs/module-warnings/Foo.pcm %t.dir/ModuleCacheRenamed + +RUN: dsymutil -verify -f -oso-prepend-path=%t.dir -y \ +RUN: %p/dummy-debug-map.map -o %t \ +RUN: -object-prefix-map=/ModuleCache=/ModuleCacheRenamed \ +RUN: 2>&1 | FileCheck %s + +CHECK: warning: {{.*}}Bar.pcm: +CHECK-NOT: warning: {{.*}}Foo.pcm: diff --git a/llvm/test/tools/dsymutil/cmdline.test b/llvm/test/tools/dsymutil/cmdline.test index fc3f00b369fdee..701de29637dd84 100644 --- a/llvm/test/tools/dsymutil/cmdline.test +++ b/llvm/test/tools/dsymutil/cmdline.test @@ -12,6 +12,7 @@ HELP: -no-odr HELP: -no-output HELP: -no-swiftmodule-timestamp HELP: -num-threads +HELP: -object-prefix-map HELP: -oso-prepend-path HELP: -o HELP: -papertrail diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp index 51912c78769bf1..c1c247d1baa02a 100644 --- a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp +++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp @@ -297,6 +297,7 @@ bool DwarfLinkerForBinary::link(const DebugMap &Map) { remarks::RemarkLinker RL; if (!Options.RemarksPrependPath.empty()) RL.setExternalFilePrependPath(Options.RemarksPrependPath); + GeneralLinker.setObjectPrefixMap(&Options.ObjectPrefixMap); std::function TranslationLambda = [&](StringRef Input) { assert(Options.Translator); diff --git a/llvm/tools/dsymutil/LinkUtils.h b/llvm/tools/dsymutil/LinkUtils.h index 92de81da8fa08c..0339f4485e73b2 100644 --- a/llvm/tools/dsymutil/LinkUtils.h +++ b/llvm/tools/dsymutil/LinkUtils.h @@ -57,6 +57,9 @@ struct LinkOptions { /// -oso-prepend-path std::string PrependPath; + /// The -object-prefix-map. + std::map ObjectPrefixMap; + /// The Resources directory in the .dSYM bundle. Optional ResourceDir; diff --git a/llvm/tools/dsymutil/Options.td b/llvm/tools/dsymutil/Options.td index eb86c2f3ae1b3e..5360bf09ac7518 100644 --- a/llvm/tools/dsymutil/Options.td +++ b/llvm/tools/dsymutil/Options.td @@ -110,6 +110,15 @@ def oso_prepend_path: Separate<["--", "-"], "oso-prepend-path">, Group; def: Joined<["--", "-"], "oso-prepend-path=">, Alias; +def object_prefix_map: Separate<["--", "-"], "object-prefix-map">, + MetaVarName<"">, + HelpText<"Remap object file paths (but no source paths) before processing." + "Use this for Clang objects where the module cache location was" + "remapped using -fdebug-prefix-map; to help dsymutil" + "find the Clang module cache.">, + Group; +def: Joined<["--", "-"], "object-prefix-map=">, Alias; + def symbolmap: Separate<["--", "-"], "symbol-map">, MetaVarName<"">, HelpText<"Updates the existing dSYMs inplace using symbol map specified.">, diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp index 54adeaa11c1a29..8b70673785daff 100644 --- a/llvm/tools/dsymutil/dsymutil.cpp +++ b/llvm/tools/dsymutil/dsymutil.cpp @@ -246,6 +246,12 @@ static Expected getOptions(opt::InputArgList &Args) { if (opt::Arg *OsoPrependPath = Args.getLastArg(OPT_oso_prepend_path)) Options.LinkOpts.PrependPath = OsoPrependPath->getValue(); + for (const auto &Arg : Args.getAllArgValues(OPT_object_prefix_map)) { + auto Split = StringRef(Arg).split('='); + Options.LinkOpts.ObjectPrefixMap.insert( + {std::string(Split.first), std::string(Split.second)}); + } + if (opt::Arg *OutputFile = Args.getLastArg(OPT_output)) Options.OutputFile = OutputFile->getValue(); From dca920a904c27f4c86e909ef2e4e343d48168cca Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Wed, 18 Mar 2020 15:30:20 -0700 Subject: [PATCH 11/33] Allow remapping Clang module include paths rdar://problem/55685132 Differential Revision: https://reviews.llvm.org/D76385 --- clang/lib/CodeGen/CGDebugInfo.cpp | 30 +++++++++++++------- clang/test/Modules/debug-info-moduleimport.m | 2 ++ 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index da6cb458982ba1..49c57e9860a60e 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -2483,6 +2483,17 @@ llvm::DIModule *CGDebugInfo::getOrCreateModuleRef(ASTSourceDescriptor Mod, assert(StringRef(M->Name).startswith(CGM.getLangOpts().ModuleName) && "clang module without ASTFile must be specified by -fmodule-name"); + // Return a StringRef to the remapped Path. + auto RemapPath = [this](StringRef Path) -> std::string { + std::string Remapped = remapDIPath(Path); + StringRef Relative(Remapped); + StringRef CompDir = TheCU->getDirectory(); + if (Relative.consume_front(CompDir)) + Relative.consume_front(llvm::sys::path::get_separator()); + + return Relative.str(); + }; + if (CreateSkeletonCU && IsRootModule && !Mod.getASTFile().empty()) { // PCH files don't have a signature field in the control block, // but LLVM detects skeleton CUs by looking for a non-zero DWO id. @@ -2496,16 +2507,12 @@ llvm::DIModule *CGDebugInfo::getOrCreateModuleRef(ASTSourceDescriptor Mod, if (!llvm::sys::path::is_absolute(Mod.getASTFile())) PCM = Mod.getPath(); llvm::sys::path::append(PCM, Mod.getASTFile()); - std::string RemappedPCM = remapDIPath(PCM); - StringRef RelativePCM(RemappedPCM); - StringRef CompDir = TheCU->getDirectory(); - if (RelativePCM.consume_front(CompDir)) - RelativePCM.consume_front(llvm::sys::path::get_separator()); - DIB.createCompileUnit(TheCU->getSourceLanguage(), - // TODO: Support "Source" from external AST providers? - DIB.createFile(Mod.getModuleName(), CompDir), - TheCU->getProducer(), false, StringRef(), 0, RelativePCM, - llvm::DICompileUnit::FullDebug, Signature); + DIB.createCompileUnit( + TheCU->getSourceLanguage(), + // TODO: Support "Source" from external AST providers? + DIB.createFile(Mod.getModuleName(), TheCU->getDirectory()), + TheCU->getProducer(), false, StringRef(), 0, RemapPath(PCM), + llvm::DICompileUnit::FullDebug, Signature); DIB.finalize(); } @@ -2513,9 +2520,10 @@ llvm::DIModule *CGDebugInfo::getOrCreateModuleRef(ASTSourceDescriptor Mod, IsRootModule ? nullptr : getOrCreateModuleRef(ASTSourceDescriptor(*M->Parent), CreateSkeletonCU); + std::string IncludePath = Mod.getPath().str(); llvm::DIModule *DIMod = DBuilder.createModule(Parent, Mod.getModuleName(), ConfigMacros, - Mod.getPath()); + RemapPath(IncludePath)); ModuleCache[M].reset(DIMod); return DIMod; } diff --git a/clang/test/Modules/debug-info-moduleimport.m b/clang/test/Modules/debug-info-moduleimport.m index 9dee9964b538ea..5787ffe2275133 100644 --- a/clang/test/Modules/debug-info-moduleimport.m +++ b/clang/test/Modules/debug-info-moduleimport.m @@ -34,9 +34,11 @@ // RUN: %clang_cc1 -debug-info-kind=limited -fmodules -fimplicit-module-maps \ // RUN: -fmodules-cache-path=%t -fdebug-prefix-map=%t=/MODULE-CACHE \ +// RUN: -fdebug-prefix-map=%S=/SRCDIR \ // RUN: -fmodule-format=obj -dwarf-ext-refs \ // RUN: %s -I %S/Inputs -isysroot /tmp/.. -I %t -emit-llvm -o - \ // RUN: | FileCheck %s --check-prefix=SKEL-CHECK +// SKEL-CHECK: includePath: "/SRCDIR/Inputs" // SKEL-CHECK: distinct !DICompileUnit({{.*}}file: ![[CUFILE:[0-9]+]] // SKEL-CHECK: ![[CUFILE]] = !DIFile({{.*}}directory: "[[COMP_DIR:.*]]" // SKEL-CHECK: distinct !DICompileUnit({{.*}}file: ![[DWOFILE:[0-9]+]]{{.*}}splitDebugFilename: "/MODULE-CACHE{{.*}}dwoId From 01a337cfc92c57d979939fae9394fbbede63f64b Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 24 Mar 2020 20:27:00 -0400 Subject: [PATCH 12/33] AMDGPU/GlobalISel: Add missing tests for G_FRINT selection --- .../AMDGPU/GlobalISel/inst-select-frint.mir | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frint.mir diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frint.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frint.mir new file mode 100644 index 00000000000000..45a8551ee47e3b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frint.mir @@ -0,0 +1,105 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=bonaire -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: frint_s32_vv +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: frint_s32_vv + ; GCN: liveins: $vgpr0 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN: [[V_RNDNE_F32_e64_:%[0-9]+]]:vgpr_32 = V_RNDNE_F32_e64 0, [[COPY]], 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[V_RNDNE_F32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = G_FRINT %0 + $vgpr0 = COPY %1 +... + +--- +name: frint_s32_vs +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GCN-LABEL: name: frint_s32_vs + ; GCN: liveins: $sgpr0 + ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GCN: [[V_RNDNE_F32_e64_:%[0-9]+]]:vgpr_32 = V_RNDNE_F32_e64 0, [[COPY]], 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[V_RNDNE_F32_e64_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:vgpr(s32) = G_FRINT %0 + $vgpr0 = COPY %1 +... + +--- +name: frint_fneg_s32_vv +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: frint_fneg_s32_vv + ; GCN: liveins: $vgpr0 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN: [[V_RNDNE_F32_e64_:%[0-9]+]]:vgpr_32 = V_RNDNE_F32_e64 1, [[COPY]], 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[V_RNDNE_F32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = G_FNEG %0 + %2:vgpr(s32) = G_FRINT %1 + $vgpr0 = COPY %2 +... + +--- +name: frint_s64_vv +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GCN-LABEL: name: frint_s64_vv + ; GCN: liveins: $vgpr0_vgpr1 + ; GCN: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GCN: [[V_RNDNE_F64_e64_:%[0-9]+]]:vreg_64 = V_RNDNE_F64_e64 0, [[COPY]], 0, 0, implicit $exec + ; GCN: $vgpr0_vgpr1 = COPY [[V_RNDNE_F64_e64_]] + %0:vgpr(s64) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = G_FRINT %0 + $vgpr0_vgpr1 = COPY %1 +... + +--- +name: frint_s64_fneg_vv +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GCN-LABEL: name: frint_s64_fneg_vv + ; GCN: liveins: $vgpr0_vgpr1 + ; GCN: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GCN: [[V_RNDNE_F64_e64_:%[0-9]+]]:vreg_64 = V_RNDNE_F64_e64 1, [[COPY]], 0, 0, implicit $exec + ; GCN: $vgpr0_vgpr1 = COPY [[V_RNDNE_F64_e64_]] + %0:vgpr(s64) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = G_FNEG %0 + %2:vgpr(s64) = G_FRINT %1 + $vgpr0_vgpr1 = COPY %2 +... From c9e0b448b8cd322227a6d906760f2837e1832f8a Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 24 Mar 2020 20:33:39 -0400 Subject: [PATCH 13/33] AMDGPU/GlobalISel: Add load legalization tests --- .../legalize-sextload-constant-32bit.mir | 168 +++++++++++++++++ .../legalize-zextload-constant-32bit.mir | 170 ++++++++++++++++++ 2 files changed, 338 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-constant-32bit.mir create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-constant-32bit.mir diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-constant-32bit.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-constant-32bit.mir new file mode 100644 index 00000000000000..b7896fc8184d6f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-constant-32bit.mir @@ -0,0 +1,168 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=CI %s + +--- +name: test_sextload_constant32bit_s64_s32_align4 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_sextload_constant32bit_s64_s32_align4 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 4, addrspace 6) + ; CI: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD]](s32) + ; CI: $vgpr0_vgpr1 = COPY [[SEXT]](s64) + %0:_(p6) = COPY $sgpr0 + %1:_(s64) = G_SEXTLOAD %0 :: (load 4, align 4, addrspace 6) + $vgpr0_vgpr1 = COPY %1 +... + +--- +name: test_sextload_constant32bit_s64_s32_align2 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_sextload_constant32bit_s64_s32_align2 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 2, addrspace 6) + ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C1]](s64) + ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load 2, addrspace 6) + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[OR]](s32) + ; CI: $vgpr0_vgpr1 = COPY [[SEXT]](s64) + %0:_(p6) = COPY $sgpr0 + %1:_(s64) = G_SEXTLOAD %0 :: (load 4, align 2, addrspace 6) + $vgpr0_vgpr1 = COPY %1 +... + +--- +name: test_sextload_constant32bit_s64_s32_align1 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_sextload_constant32bit_s64_s32_align1 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 1, addrspace 6) + ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C1]](s64) + ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load 1, addrspace 6) + ; CI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C2]](s64) + ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 1, addrspace 6) + ; CI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C3]](s64) + ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (load 1, addrspace 6) + ; CI: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; CI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C6]] + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[COPY1]](s32) + ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; CI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] + ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C6]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s32) + ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C7]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[OR2]](s32) + ; CI: $vgpr0_vgpr1 = COPY [[SEXT]](s64) + %0:_(p6) = COPY $sgpr0 + %1:_(s64) = G_SEXTLOAD %0 :: (load 4, align 1, addrspace 6) + $vgpr0_vgpr1 = COPY %1 +... + +--- +name: test_sextload_constant32bit_s32_s8_align1 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_sextload_constant32bit_s32_s8_align1 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 1, addrspace 6) + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8 + ; CI: $vgpr0 = COPY [[SEXT_INREG]](s32) + %0:_(p6) = COPY $sgpr0 + %1:_(s32) = G_SEXTLOAD %0 :: (load 1, align 1, addrspace 6) + $vgpr0 = COPY %1 +... + +--- +name: test_sextload_constant32bit_s32_s16_align2 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_sextload_constant32bit_s32_s16_align2 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 2, addrspace 6) + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 16 + ; CI: $vgpr0 = COPY [[SEXT_INREG]](s32) + %0:_(p6) = COPY $sgpr0 + %1:_(s32) = G_SEXTLOAD %0 :: (load 2, align 2, addrspace 6) + $vgpr0 = COPY %1 +... + +--- +name: test_sextload_constant32bit_s32_s16_align1 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_sextload_constant32bit_s32_s16_align1 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 1, addrspace 6) + ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C1]](s64) + ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load 1, addrspace 6) + ; CI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C2]] + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C4]] + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; CI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] + ; CI: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[OR]](s16) + ; CI: $vgpr0 = COPY [[SEXT]](s32) + %0:_(p6) = COPY $sgpr0 + %1:_(s32) = G_SEXTLOAD %0 :: (load 2, align 1, addrspace 6) + $vgpr0 = COPY %1 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-constant-32bit.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-constant-32bit.mir new file mode 100644 index 00000000000000..9d342e8d6f5e7b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-constant-32bit.mir @@ -0,0 +1,170 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=CI %s + +--- +name: test_zextload_constant32bit_s64_s32_align4 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_zextload_constant32bit_s64_s32_align4 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 4, addrspace 6) + ; CI: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD]](s32) + ; CI: $vgpr0_vgpr1 = COPY [[ZEXT]](s64) + %0:_(p6) = COPY $sgpr0 + %1:_(s64) = G_ZEXTLOAD %0 :: (load 4, align 4, addrspace 6) + $vgpr0_vgpr1 = COPY %1 +... + +--- +name: test_zextload_constant32bit_s64_s32_align2 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_zextload_constant32bit_s64_s32_align2 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 2, addrspace 6) + ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C1]](s64) + ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load 2, addrspace 6) + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; CI: $vgpr0_vgpr1 = COPY [[ZEXT]](s64) + %0:_(p6) = COPY $sgpr0 + %1:_(s64) = G_ZEXTLOAD %0 :: (load 4, align 2, addrspace 6) + $vgpr0_vgpr1 = COPY %1 +... + +--- +name: test_zextload_constant32bit_s64_s32_align1 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_zextload_constant32bit_s64_s32_align1 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 1, addrspace 6) + ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C1]](s64) + ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load 1, addrspace 6) + ; CI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C2]](s64) + ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 1, addrspace 6) + ; CI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C3]](s64) + ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (load 1, addrspace 6) + ; CI: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; CI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C6]] + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[COPY1]](s32) + ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; CI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] + ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C6]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s32) + ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C7]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; CI: $vgpr0_vgpr1 = COPY [[ZEXT2]](s64) + %0:_(p6) = COPY $sgpr0 + %1:_(s64) = G_ZEXTLOAD %0 :: (load 4, align 1, addrspace 6) + $vgpr0_vgpr1 = COPY %1 +... + +--- +name: test_zextload_constant32bit_s32_s8_align1 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_zextload_constant32bit_s32_s8_align1 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 1, addrspace 6) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: $vgpr0 = COPY [[AND]](s32) + %0:_(p6) = COPY $sgpr0 + %1:_(s32) = G_ZEXTLOAD %0 :: (load 1, align 1, addrspace 6) + $vgpr0 = COPY %1 +... + +--- +name: test_zextload_constant32bit_s32_s16_align2 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_zextload_constant32bit_s32_s16_align2 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 2, addrspace 6) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: $vgpr0 = COPY [[AND]](s32) + %0:_(p6) = COPY $sgpr0 + %1:_(s32) = G_ZEXTLOAD %0 :: (load 2, align 2, addrspace 6) + $vgpr0 = COPY %1 +... + +--- +name: test_zextload_constant32bit_s32_s16_align1 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_zextload_constant32bit_s32_s16_align1 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 1, addrspace 6) + ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C1]](s64) + ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load 1, addrspace 6) + ; CI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C2]] + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C4]] + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; CI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: $vgpr0 = COPY [[ZEXT]](s32) + %0:_(p6) = COPY $sgpr0 + %1:_(s32) = G_ZEXTLOAD %0 :: (load 2, align 1, addrspace 6) + $vgpr0 = COPY %1 +... From bba8c92d541a2f0b003df6cb79c61a5664750f9f Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 24 Mar 2020 20:47:54 -0400 Subject: [PATCH 14/33] AMDGPU/GlobalISel: Add select patterns for v_and_or_b32 --- .../GlobalISel/inst-select-pattern-and-or.mir | 176 ++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-and-or.mir diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-and-or.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-and-or.mir new file mode 100644 index 00000000000000..0230dd835215fb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-and-or.mir @@ -0,0 +1,176 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s + +--- + +name: and_or_s32_sgpr_sgpr_sgpr +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + ; GFX8-LABEL: name: and_or_s32_sgpr_sgpr_sgpr + ; GFX8: liveins: $sgpr0, $sgpr1, $sgpr2 + ; GFX8: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX8: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[COPY1]], implicit-def $scc + ; GFX8: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[COPY2]], implicit-def $scc + ; GFX8: S_ENDPGM 0, implicit [[S_OR_B32_]] + ; GFX9-LABEL: name: and_or_s32_sgpr_sgpr_sgpr + ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX9: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[COPY1]], implicit-def $scc + ; GFX9: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[COPY2]], implicit-def $scc + ; GFX9: S_ENDPGM 0, implicit [[S_OR_B32_]] + ; GFX10-LABEL: name: and_or_s32_sgpr_sgpr_sgpr + ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX10: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[COPY1]], implicit-def $scc + ; GFX10: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[COPY2]], implicit-def $scc + ; GFX10: S_ENDPGM 0, implicit [[S_OR_B32_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = COPY $sgpr2 + %3:sgpr(s32) = G_AND %0, %1 + %4:sgpr(s32) = G_OR %3, %2 + S_ENDPGM 0, implicit %4 +... + +--- + +name: and_or_s32_vgpr_vgpr_vgpr +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX8-LABEL: name: and_or_s32_vgpr_vgpr_vgpr + ; GFX8: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[COPY2]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_OR_B32_e64_]] + ; GFX9-LABEL: name: and_or_s32_vgpr_vgpr_vgpr + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_AND_OR_B32_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_AND_OR_B32_]] + ; GFX10-LABEL: name: and_or_s32_vgpr_vgpr_vgpr + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10: [[V_AND_OR_B32_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_AND_OR_B32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_AND %0, %1 + %4:vgpr(s32) = G_OR %3, %2 + S_ENDPGM 0, implicit %4 +... + +--- + +name: and_or_s32_vgpr_vgpr_vgpr_commute +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX8-LABEL: name: and_or_s32_vgpr_vgpr_vgpr_commute + ; GFX8: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[COPY2]], [[V_AND_B32_e64_]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_OR_B32_e64_]] + ; GFX9-LABEL: name: and_or_s32_vgpr_vgpr_vgpr_commute + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_AND_OR_B32_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_AND_OR_B32_]] + ; GFX10-LABEL: name: and_or_s32_vgpr_vgpr_vgpr_commute + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10: [[V_AND_OR_B32_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_AND_OR_B32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_AND %0, %1 + %4:vgpr(s32) = G_OR %2, %3 + S_ENDPGM 0, implicit %4 +... + +--- + +name: and_or_s32_sgpr_sgpr_vgpr +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX8-LABEL: name: and_or_s32_sgpr_sgpr_vgpr + ; GFX8: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX8: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[COPY1]], implicit-def $scc + ; GFX8: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]] + ; GFX8: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[COPY3]], [[COPY2]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_OR_B32_e64_]] + ; GFX9-LABEL: name: and_or_s32_sgpr_sgpr_vgpr + ; GFX9: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[COPY1]], implicit-def $scc + ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]] + ; GFX9: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[COPY3]], [[COPY2]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_OR_B32_e64_]] + ; GFX10-LABEL: name: and_or_s32_sgpr_sgpr_vgpr + ; GFX10: liveins: $sgpr0, $sgpr1, $vgpr0 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[COPY1]], implicit-def $scc + ; GFX10: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]] + ; GFX10: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[COPY3]], [[COPY2]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_OR_B32_e64_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:vgpr(s32) = COPY $vgpr0 + %3:sgpr(s32) = G_AND %0, %1 + %4:vgpr(s32) = COPY %3 + %5:vgpr(s32) = G_OR %4, %2 + S_ENDPGM 0, implicit %5 +... From c025235e962e2cec58e47a572eac3f88a55f3a1e Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Tue, 24 Mar 2020 17:56:36 -0700 Subject: [PATCH 15/33] Revert "Allow remapping Clang module include paths" to investigate why this commit broke a test in the LLDB testsuite. This reverts commit dca920a904c27f4c86e909ef2e4e343d48168cca. --- clang/lib/CodeGen/CGDebugInfo.cpp | 30 +++++++------------- clang/test/Modules/debug-info-moduleimport.m | 2 -- 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 49c57e9860a60e..da6cb458982ba1 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -2483,17 +2483,6 @@ llvm::DIModule *CGDebugInfo::getOrCreateModuleRef(ASTSourceDescriptor Mod, assert(StringRef(M->Name).startswith(CGM.getLangOpts().ModuleName) && "clang module without ASTFile must be specified by -fmodule-name"); - // Return a StringRef to the remapped Path. - auto RemapPath = [this](StringRef Path) -> std::string { - std::string Remapped = remapDIPath(Path); - StringRef Relative(Remapped); - StringRef CompDir = TheCU->getDirectory(); - if (Relative.consume_front(CompDir)) - Relative.consume_front(llvm::sys::path::get_separator()); - - return Relative.str(); - }; - if (CreateSkeletonCU && IsRootModule && !Mod.getASTFile().empty()) { // PCH files don't have a signature field in the control block, // but LLVM detects skeleton CUs by looking for a non-zero DWO id. @@ -2507,12 +2496,16 @@ llvm::DIModule *CGDebugInfo::getOrCreateModuleRef(ASTSourceDescriptor Mod, if (!llvm::sys::path::is_absolute(Mod.getASTFile())) PCM = Mod.getPath(); llvm::sys::path::append(PCM, Mod.getASTFile()); - DIB.createCompileUnit( - TheCU->getSourceLanguage(), - // TODO: Support "Source" from external AST providers? - DIB.createFile(Mod.getModuleName(), TheCU->getDirectory()), - TheCU->getProducer(), false, StringRef(), 0, RemapPath(PCM), - llvm::DICompileUnit::FullDebug, Signature); + std::string RemappedPCM = remapDIPath(PCM); + StringRef RelativePCM(RemappedPCM); + StringRef CompDir = TheCU->getDirectory(); + if (RelativePCM.consume_front(CompDir)) + RelativePCM.consume_front(llvm::sys::path::get_separator()); + DIB.createCompileUnit(TheCU->getSourceLanguage(), + // TODO: Support "Source" from external AST providers? + DIB.createFile(Mod.getModuleName(), CompDir), + TheCU->getProducer(), false, StringRef(), 0, RelativePCM, + llvm::DICompileUnit::FullDebug, Signature); DIB.finalize(); } @@ -2520,10 +2513,9 @@ llvm::DIModule *CGDebugInfo::getOrCreateModuleRef(ASTSourceDescriptor Mod, IsRootModule ? nullptr : getOrCreateModuleRef(ASTSourceDescriptor(*M->Parent), CreateSkeletonCU); - std::string IncludePath = Mod.getPath().str(); llvm::DIModule *DIMod = DBuilder.createModule(Parent, Mod.getModuleName(), ConfigMacros, - RemapPath(IncludePath)); + Mod.getPath()); ModuleCache[M].reset(DIMod); return DIMod; } diff --git a/clang/test/Modules/debug-info-moduleimport.m b/clang/test/Modules/debug-info-moduleimport.m index 5787ffe2275133..9dee9964b538ea 100644 --- a/clang/test/Modules/debug-info-moduleimport.m +++ b/clang/test/Modules/debug-info-moduleimport.m @@ -34,11 +34,9 @@ // RUN: %clang_cc1 -debug-info-kind=limited -fmodules -fimplicit-module-maps \ // RUN: -fmodules-cache-path=%t -fdebug-prefix-map=%t=/MODULE-CACHE \ -// RUN: -fdebug-prefix-map=%S=/SRCDIR \ // RUN: -fmodule-format=obj -dwarf-ext-refs \ // RUN: %s -I %S/Inputs -isysroot /tmp/.. -I %t -emit-llvm -o - \ // RUN: | FileCheck %s --check-prefix=SKEL-CHECK -// SKEL-CHECK: includePath: "/SRCDIR/Inputs" // SKEL-CHECK: distinct !DICompileUnit({{.*}}file: ![[CUFILE:[0-9]+]] // SKEL-CHECK: ![[CUFILE]] = !DIFile({{.*}}directory: "[[COMP_DIR:.*]]" // SKEL-CHECK: distinct !DICompileUnit({{.*}}file: ![[DWOFILE:[0-9]+]]{{.*}}splitDebugFilename: "/MODULE-CACHE{{.*}}dwoId From d16ee1174a9043d468cf925133133b0d58f3d937 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 24 Mar 2020 21:21:59 -0400 Subject: [PATCH 16/33] AMDGPU/GlobalISel: Add some end to end tests for fma selection --- llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll | 612 +++++++++++++++++++++ 1 file changed, 612 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll new file mode 100644 index 00000000000000..d8fbcfaac2009a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -0,0 +1,612 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s + +define float @v_fma_f32(float %x, float %y, float %z) { +; GFX6-LABEL: v_fma_f32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fma = call float @llvm.fma.f32(float %x, float %y, float %z) + ret float %fma +} + +define <2 x float> @v_fma_v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z) { +; GFX6-LABEL: v_fma_v2f32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v2f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z) + ret <2 x float> %fma +} + +define half @v_fma_f16(half %x, half %y, half %z) { +; GFX6-LABEL: v_fma_f16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fma = call half @llvm.fma.f16(half %x, half %y, half %z) + ret half %fma +} + +define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) { +; GFX6-LABEL: v_fma_v2f16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) + ret <2 x half> %fma +} + +define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half> %z) { +; GFX6-LABEL: v_fma_v2f16_fneg_lhs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v2f16_fneg_lhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_v2f16_fneg_lhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %x.fneg = fneg <2 x half> %x + %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x.fneg, <2 x half> %y, <2 x half> %z) + ret <2 x half> %fma +} + +define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half> %z) { +; GFX6-LABEL: v_fma_v2f16_fneg_rhs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v2f16_fneg_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_v2f16_fneg_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %y.fneg = fneg <2 x half> %y + %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> %y.fneg, <2 x half> %z) + ret <2 x half> %fma +} + +define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x half> %z) { +; GFX6-LABEL: v_fma_v2f16_fneg_lhs_rhs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v6 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v6 +; GFX6-NEXT: s_mov_b32 s4, 0x80008000 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_fma_f32 v0, v0, v1, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_fma_f32 v1, v2, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v2f16_fneg_lhs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x80008000 +; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_v2f16_fneg_lhs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %x.fneg = fneg <2 x half> %x + %y.fneg = fneg <2 x half> %y + %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x.fneg, <2 x half> %y.fneg, <2 x half> %z) + ret <2 x half> %fma +} + +; FIXME: +; define <3 x half> @v_fma_v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) { +; %fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) +; ret <3 x half> %fma +; } + +define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) { +; GFX6-LABEL: v_fma_v4f16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX6-NEXT: v_fma_f32 v0, v0, v4, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v7 +; GFX6-NEXT: v_fma_f32 v1, v1, v5, v9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v10 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v11 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_fma_f32 v2, v2, v4, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_fma_f32 v3, v3, v6, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v4f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GFX8-NEXT: v_fma_f16 v2, v6, v8, v10 +; GFX8-NEXT: v_mov_b32_e32 v4, 16 +; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_fma_f16 v3, v7, v9, v11 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fma = call <4 x half> @llvm.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) + ret <4 x half> %fma +} + +define double @v_fma_f64(double %x, double %y, double %z) { +; GFX6-LABEL: v_fma_f64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fma = call double @llvm.fma.f64(double %x, double %y, double %z) + ret double %fma +} + +define double @v_fma_f64_fneg_all(double %x, double %y, double %z) { +; GFX6-LABEL: v_fma_f64_fneg_all: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f64 v[0:1], -v[0:1], -v[2:3], -v[4:5] +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f64_fneg_all: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f64 v[0:1], -v[0:1], -v[2:3], -v[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f64_fneg_all: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f64 v[0:1], -v[0:1], -v[2:3], -v[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %neg.x = fneg double %x + %neg.y = fneg double %y + %neg.z = fneg double %z + %fma = call double @llvm.fma.f64(double %neg.x, double %neg.y, double %neg.z) + ret double %fma +} + +define <2 x double> @v_fma_v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) { +; GFX6-LABEL: v_fma_v2f64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] +; GFX6-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v2f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] +; GFX8-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_v2f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] +; GFX9-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) + ret <2 x double> %fma +} + +define float @v_fma_f32_fabs_lhs(float %x, float %y, float %z) { +; GFX6-LABEL: v_fma_f32_fabs_lhs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f32 v0, |v0|, v1, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f32_fabs_lhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f32 v0, |v0|, v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f32_fabs_lhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, |v0|, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fabs.x = call float @llvm.fabs.f32(float %x) + %fma = call float @llvm.fma.f32(float %fabs.x, float %y, float %z) + ret float %fma +} + +define float @v_fma_f32_fabs_rhs(float %x, float %y, float %z) { +; GFX6-LABEL: v_fma_f32_fabs_rhs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f32 v0, v0, |v1|, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f32_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f32 v0, v0, |v1|, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f32_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, v0, |v1|, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call float @llvm.fabs.f32(float %y) + %fma = call float @llvm.fma.f32(float %x, float %fabs.y, float %z) + ret float %fma +} + +define float @v_fma_f32_fabs_lhs_rhs(float %x, float %y, float %z) { +; GFX6-LABEL: v_fma_f32_fabs_lhs_rhs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f32 v0, |v0|, |v1|, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f32_fabs_lhs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f32 v0, |v0|, |v1|, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f32_fabs_lhs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, |v0|, |v1|, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fabs.x = call float @llvm.fabs.f32(float %x) + %fabs.y = call float @llvm.fabs.f32(float %y) + %fma = call float @llvm.fma.f32(float %fabs.x, float %fabs.y, float %z) + ret float %fma +} + +define amdgpu_ps float @v_fma_f32_sgpr_vgpr_vgpr(float inreg %x, float %y, float %z) { +; GFX6-LABEL: v_fma_f32_sgpr_vgpr_vgpr: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_fma_f32 v0, s0, v0, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fma_f32_sgpr_vgpr_vgpr: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_fma_f32 v0, s0, v0, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fma_f32_sgpr_vgpr_vgpr: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_fma_f32 v0, s0, v0, v1 +; GFX9-NEXT: ; return to shader part epilog + %fma = call float @llvm.fma.f32(float %x, float %y, float %z) + ret float %fma +} + +define amdgpu_ps float @v_fma_f32_vgpr_sgpr_vgpr(float %x, float inreg %y, float %z) { +; GFX6-LABEL: v_fma_f32_vgpr_sgpr_vgpr: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_fma_f32 v0, v0, s0, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fma_f32_vgpr_sgpr_vgpr: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_fma_f32 v0, v0, s0, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fma_f32_vgpr_sgpr_vgpr: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_fma_f32 v0, v0, s0, v1 +; GFX9-NEXT: ; return to shader part epilog + %fma = call float @llvm.fma.f32(float %x, float %y, float %z) + ret float %fma +} + +define amdgpu_ps float @v_fma_f32_sgpr_sgpr_sgpr(float inreg %x, float inreg %y, float inreg %z) { +; GFX6-LABEL: v_fma_f32_sgpr_sgpr_sgpr: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_fma_f32 v0, s0, v0, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fma_f32_sgpr_sgpr_sgpr: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_fma_f32 v0, s0, v0, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fma_f32_sgpr_sgpr_sgpr: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_fma_f32 v0, s0, v0, v1 +; GFX9-NEXT: ; return to shader part epilog + %fma = call float @llvm.fma.f32(float %x, float %y, float %z) + ret float %fma +} + +define float @v_fma_f32_fneg_lhs(float %x, float %y, float %z) { +; GFX6-LABEL: v_fma_f32_fneg_lhs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f32 v0, -v0, v1, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f32_fneg_lhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f32 v0, -v0, v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f32_fneg_lhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, -v0, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %neg.x = fneg float %x + %fma = call float @llvm.fma.f32(float %neg.x, float %y, float %z) + ret float %fma +} + +define float @v_fma_f32_fneg_rhs(float %x, float %y, float %z) { +; GFX6-LABEL: v_fma_f32_fneg_rhs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f32 v0, v0, -v1, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f32_fneg_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f32 v0, v0, -v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f32_fneg_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, v0, -v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %neg.y = fneg float %y + %fma = call float @llvm.fma.f32(float %x, float %neg.y, float %z) + ret float %fma +} + +define float @v_fma_f32_fneg_z(float %x, float %y, float %z) { +; GFX6-LABEL: v_fma_f32_fneg_z: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f32 v0, v0, v1, -v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f32_fneg_z: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f32 v0, v0, v1, -v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f32_fneg_z: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, v0, v1, -v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %neg.z = fneg float %z + %fma = call float @llvm.fma.f32(float %x, float %y, float %neg.z) + ret float %fma +} + +declare half @llvm.fma.f16(half, half, half) #0 +declare float @llvm.fma.f32(float, float, float) #0 +declare double @llvm.fma.f64(double, double, double) #0 + +declare half @llvm.fabs.f16(half) #0 +declare float @llvm.fabs.f32(float) #0 + +declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0 +declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) #0 +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #0 + +declare <3 x half> @llvm.fma.v3f16(<3 x half>, <3 x half>, <3 x half>) #0 +declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>) #0 + +attributes #0 = { nounwind readnone speculatable willreturn } From d6fb02b196da16f77bf8cfede9296b3b51360798 Mon Sep 17 00:00:00 2001 From: Eric Fiselier Date: Tue, 24 Mar 2020 21:33:42 -0400 Subject: [PATCH 17/33] [libc++] Update a bad documentation link --- libcxx/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index 6b2c4714dc0b0d..fa194d4bffd963 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -1,4 +1,5 @@ -# See www/CMake.html for instructions on how to build libcxx with CMake. +# See https://libcxx.llvm.org/docs/BuildingLibcxx.html for instructions on how +# to build libcxx with CMake. #=============================================================================== # Setup Project From 32bddad37bf01ea9151f7cef44f56bf4d3fe4f23 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Sat, 21 Mar 2020 16:41:00 -0700 Subject: [PATCH 18/33] [profile] Make atexit hook a no-op on Fuchsia On Fuchsia, we always use the continuous mode with runtime counter relocation, so there's no need for atexit hook or support for dumping the profile manually. Differential Revision: https://reviews.llvm.org/D76556 --- compiler-rt/lib/profile/InstrProfiling.h | 3 + .../lib/profile/InstrProfilingBuffer.c | 5 +- compiler-rt/lib/profile/InstrProfilingFile.c | 33 +++++++-- .../lib/profile/InstrProfilingInternal.h | 5 +- .../profile/InstrProfilingPlatformFuchsia.c | 71 +++++-------------- .../lib/profile/InstrProfilingRuntime.cpp | 4 +- 6 files changed, 54 insertions(+), 67 deletions(-) diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index a51cf38285ba3e..d7a7c32332c1b7 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -218,6 +218,9 @@ int __llvm_profile_register_write_file_atexit(void); /*! \brief Initialize file handling. */ void __llvm_profile_initialize_file(void); +/*! \brief Initialize the profile runtime. */ +void __llvm_profile_initialize(void); + /*! * \brief Return path prefix (excluding the base filename) of the profile data. * This is useful for users using \c -fprofile-generate=./path_prefix who do diff --git a/compiler-rt/lib/profile/InstrProfilingBuffer.c b/compiler-rt/lib/profile/InstrProfilingBuffer.c index 489ba2929ccbc7..5ee44785a7ab90 100644 --- a/compiler-rt/lib/profile/InstrProfilingBuffer.c +++ b/compiler-rt/lib/profile/InstrProfilingBuffer.c @@ -10,9 +10,6 @@ #include "InstrProfilingInternal.h" #include "InstrProfilingPort.h" -/* When counters are being relocated at runtime, this parameter is set to 1. */ -COMPILER_RT_VISIBILITY int RuntimeCounterRelocation = 0; - /* When continuous mode is enabled (%c), this parameter is set to 1. * * This parameter is defined here in InstrProfilingBuffer.o, instead of in @@ -66,7 +63,7 @@ void __llvm_profile_get_padding_sizes_for_counters( uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterNames) { if (!__llvm_profile_is_continuous_mode_enabled() || - RuntimeCounterRelocation) { + lprofRuntimeCounterRelocation()) { *PaddingBytesBeforeCounters = 0; *PaddingBytesAfterCounters = 0; *PaddingBytesAfterNames = __llvm_profile_get_num_padding_bytes(NamesSize); diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index e0a29119e692bf..875de885b2ee74 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -35,6 +35,16 @@ #include "InstrProfilingPort.h" #include "InstrProfilingUtil.h" +static int RuntimeCounterRelocation = 0; + +COMPILER_RT_VISIBILITY unsigned lprofRuntimeCounterRelocation(void) { + return RuntimeCounterRelocation; +} + +COMPILER_RT_VISIBILITY void lprofSetRuntimeCounterRelocation(void) { + RuntimeCounterRelocation = 1; +} + /* From where is profile name specified. * The order the enumerators define their * precedence. Re-order them may lead to @@ -477,7 +487,8 @@ static int writeMMappedFile(FILE *OutputFile, char **Profile) { } static void relocateCounters(void) { - if (!__llvm_profile_is_continuous_mode_enabled() || !RuntimeCounterRelocation) + if (!__llvm_profile_is_continuous_mode_enabled() || + !lprofRuntimeCounterRelocation()) return; /* Get the sizes of various profile data sections. Taken from @@ -808,7 +819,7 @@ static void parseAndSetFilename(const char *FilenamePat, truncateCurrentFile(); if (__llvm_profile_is_continuous_mode_enabled()) { - if (RuntimeCounterRelocation) + if (lprofRuntimeCounterRelocation()) relocateCounters(); else initializeProfileForContinuousMode(); @@ -951,10 +962,10 @@ const char *__llvm_profile_get_filename(void) { return FilenameBuf; } -/* This method is invoked by the runtime initialization hook - * InstrProfilingRuntime.o if it is linked in. Both user specified +/* This API initializes the file handling, both user specified * profile path via -fprofile-instr-generate= and LLVM_PROFILE_FILE - * environment variable can override this default value. */ + * environment variable can override this default value. + */ COMPILER_RT_VISIBILITY void __llvm_profile_initialize_file(void) { const char *EnvFilenamePat; @@ -963,7 +974,7 @@ void __llvm_profile_initialize_file(void) { int hasCommandLineOverrider = (INSTR_PROF_PROFILE_NAME_VAR[0] != 0); if (__llvm_profile_counter_bias != -1) - RuntimeCounterRelocation = 1; + lprofSetRuntimeCounterRelocation(); EnvFilenamePat = getFilenamePatFromEnv(); if (EnvFilenamePat) { @@ -982,6 +993,16 @@ void __llvm_profile_initialize_file(void) { parseAndSetFilename(SelectedPat, PNS, 0); } +/* This method is invoked by the runtime initialization hook + * InstrProfilingRuntime.o if it is linked in. + */ +COMPILER_RT_VISIBILITY +void __llvm_profile_initialize(void) { + __llvm_profile_initialize_file(); + if (!__llvm_profile_is_continuous_mode_enabled()) + __llvm_profile_register_write_file_atexit(); +} + /* This API is directly called by the user application code. It has the * highest precedence compared with LLVM_PROFILE_FILE environment variable * and command line option -fprofile-instr-generate=. diff --git a/compiler-rt/lib/profile/InstrProfilingInternal.h b/compiler-rt/lib/profile/InstrProfilingInternal.h index 6a8dffad50db9a..32b63955d93711 100644 --- a/compiler-rt/lib/profile/InstrProfilingInternal.h +++ b/compiler-rt/lib/profile/InstrProfilingInternal.h @@ -184,7 +184,10 @@ uint64_t lprofGetLoadModuleSignature(); unsigned lprofProfileDumped(); void lprofSetProfileDumped(); -COMPILER_RT_VISIBILITY extern int RuntimeCounterRelocation; +/* Return non zero value if counters are being relocated at runtime. */ +unsigned lprofRuntimeCounterRelocation(void); +void lprofSetRuntimeCounterRelocation(void); + COMPILER_RT_VISIBILITY extern void (*FreeHook)(void *); COMPILER_RT_VISIBILITY extern uint8_t *DynamicBufferIOBuffer; COMPILER_RT_VISIBILITY extern uint32_t VPBufferSize; diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformFuchsia.c b/compiler-rt/lib/profile/InstrProfilingPlatformFuchsia.c index 2d66be6de73e79..63c8b75b9d12c3 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformFuchsia.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformFuchsia.c @@ -34,7 +34,11 @@ #include "InstrProfilingInternal.h" #include "InstrProfilingUtil.h" -/* VMO that contains the coverage data shared across all modules. */ +COMPILER_RT_VISIBILITY unsigned lprofRuntimeCounterRelocation(void) { + return 1; +} + +/* VMO that contains the profile data for this module. */ static zx_handle_t __llvm_profile_vmo; /* Current offset within the VMO where data should be written next. */ static uint64_t __llvm_profile_offset; @@ -92,43 +96,23 @@ static void initVMOWriter(ProfDataWriter *This) { This->WriterCtx = NULL; } -static int dump(void) { - if (lprofProfileDumped()) { - lprofWrite("LLVM Profile: data not published: already written.\n"); - return 0; - } - - /* Check if there is llvm/runtime version mismatch. */ - if (GET_VERSION(__llvm_profile_get_version()) != INSTR_PROF_RAW_VERSION) { - lprofWrite("LLVM Profile: runtime and instrumentation version mismatch: " - "expected %d, but got %d\n", - INSTR_PROF_RAW_VERSION, - (int)GET_VERSION(__llvm_profile_get_version())); - return -1; - } - - /* Write the profile data into the mapped region. */ - ProfDataWriter VMOWriter; - initVMOWriter(&VMOWriter); - if (lprofWriteData(&VMOWriter, lprofGetVPDataReader(), 0) != 0) - return -1; - - return 0; -} - +/* This method is invoked by the runtime initialization hook + * InstrProfilingRuntime.o if it is linked in. */ COMPILER_RT_VISIBILITY -int __llvm_profile_dump(void) { - int rc = dump(); - lprofSetProfileDumped(); - return rc; -} - -static void dumpWithoutReturn(void) { dump(); } +void __llvm_profile_initialize(void) { + /* This symbol is defined as weak and initialized to -1 by the runtimer, but + * compiler will generate a strong definition initialized to 0 when runtime + * counter relocation is used. */ + if (__llvm_profile_counter_bias == -1) { + lprofWrite("LLVM Profile: counter relocation at runtime is required\n"); + return; + } -static void createVMO(void) { /* Don't create VMO if it has been alread created. */ - if (__llvm_profile_vmo != ZX_HANDLE_INVALID) + if (__llvm_profile_vmo != ZX_HANDLE_INVALID) { + lprofWrite("LLVM Profile: VMO has already been created\n"); return; + } const __llvm_profile_data *DataBegin = __llvm_profile_begin_data(); const __llvm_profile_data *DataEnd = __llvm_profile_end_data(); @@ -211,23 +195,4 @@ static void createVMO(void) { (uintptr_t)__llvm_profile_begin_counters() + CountersOffset; } -/* This method is invoked by the runtime initialization hook - * InstrProfilingRuntime.o if it is linked in. - */ -COMPILER_RT_VISIBILITY -void __llvm_profile_initialize_file(void) { createVMO(); } - -COMPILER_RT_VISIBILITY -int __llvm_profile_register_write_file_atexit(void) { - static bool HasBeenRegistered = false; - - if (HasBeenRegistered) - return 0; - - lprofSetupValueProfiler(); - - HasBeenRegistered = true; - return atexit(dumpWithoutReturn); -} - #endif diff --git a/compiler-rt/lib/profile/InstrProfilingRuntime.cpp b/compiler-rt/lib/profile/InstrProfilingRuntime.cpp index 5dff09d7063293..4ea2bb263f5aca 100644 --- a/compiler-rt/lib/profile/InstrProfilingRuntime.cpp +++ b/compiler-rt/lib/profile/InstrProfilingRuntime.cpp @@ -19,9 +19,7 @@ namespace { class RegisterRuntime { public: RegisterRuntime() { - __llvm_profile_initialize_file(); - if (!__llvm_profile_is_continuous_mode_enabled()) - __llvm_profile_register_write_file_atexit(); + __llvm_profile_initialize(); } }; From baa78179fef9a238f7bd88a2a0e87d6453453fa9 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 24 Mar 2020 21:52:03 -0400 Subject: [PATCH 19/33] AMDGPU/GlobalISel: Add a testcase for G_UNMERGE_VALUES legalization I had a note that this doesn't work, but it seems to now. --- .../GlobalISel/legalize-unmerge-values.mir | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir index 7313b56504a2c2..aa52a681c15689 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir @@ -656,6 +656,46 @@ body: | $vgpr1 = COPY %4 ... +--- +name: test_unmerge_s8_p3 +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: test_unmerge_s8_p3 + ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p3) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[PTRTOINT]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[COPY1]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C1]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[PTRTOINT]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: $vgpr0 = COPY [[COPY4]](s32) + ; CHECK: $vgpr1 = COPY [[COPY5]](s32) + ; CHECK: $vgpr2 = COPY [[COPY6]](s32) + ; CHECK: $vgpr3 = COPY [[COPY7]](s32) + %0:_(p3) = COPY $vgpr0 + %1:_(s8), %2:_(s8), %3:_(s8), %4:_(s8) = G_UNMERGE_VALUES %0 + %5:_(s32) = G_ANYEXT %1 + %6:_(s32) = G_ANYEXT %2 + %7:_(s32) = G_ANYEXT %3 + %8:_(s32) = G_ANYEXT %4 + $vgpr0 = COPY %5 + $vgpr1 = COPY %6 + $vgpr2 = COPY %7 + $vgpr3 = COPY %8 + +... + --- name: test_unmerge_s16_s64 body: | From 2488ea428d68fa3d1031bbb84b5d09009f37a42f Mon Sep 17 00:00:00 2001 From: QingShan Zhang Date: Wed, 25 Mar 2020 02:37:26 +0000 Subject: [PATCH 20/33] [NFC][Test][PowerPC] Add one test to verify the behavior of vector mul/add for v8i16 --- llvm/test/CodeGen/PowerPC/vmladduhm.ll | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 llvm/test/CodeGen/PowerPC/vmladduhm.ll diff --git a/llvm/test/CodeGen/PowerPC/vmladduhm.ll b/llvm/test/CodeGen/PowerPC/vmladduhm.ll new file mode 100644 index 00000000000000..f2475d9e249079 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vmladduhm.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-P9 +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-P8 +define <8 x i16> @mul(<8 x i16> %m, <8 x i16> %n) { +; CHECK-LABEL: mul: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor 36, 36, 36 +; CHECK-NEXT: vmladduhm 2, 2, 3, 4 +; CHECK-NEXT: blr +entry: + %0 = mul <8 x i16> %m, %n + ret <8 x i16> %0 +} + +define <8 x i16> @madd(<8 x i16> %m, <8 x i16> %n, <8 x i16> %o) { +; CHECK-LABEL: madd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor 37, 37, 37 +; CHECK-NEXT: vmladduhm 2, 2, 3, 5 +; CHECK-NEXT: vadduhm 2, 2, 4 +; CHECK-NEXT: blr +entry: + %0 = mul <8 x i16> %m, %n + %1 = add <8 x i16> %0, %o + ret <8 x i16> %1 +} From 64a9c944fc459475b3eefed423e3b593353c0cca Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 24 Mar 2020 22:53:47 -0400 Subject: [PATCH 21/33] Revert "[libc++] Build the dylib with C++17 to allow aligned new/delete" This reverts commit 1ac403bd145dadfa1004af29bd6c77f871caf42c, which broke some non-libc++ build bots because they use an ancient CMake. --- libcxx/CMakeLists.txt | 6 +++--- libcxx/src/barrier.cpp | 12 +++++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index fa194d4bffd963..8bdb24e5fd2af7 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -520,10 +520,10 @@ remove_flags(-Wno-pedantic -pedantic-errors -pedantic) # Required flags ============================================================== function(cxx_add_basic_build_flags target) - # Require C++17 for all targets. C++17 is needed to use aligned allocation - # in the dylib. + # Require C++14 for all targets. C++14 is needed to ensure constant + # initialization for certain globals (ex global memory resources). set_target_properties(${target} PROPERTIES - CXX_STANDARD 17 + CXX_STANDARD 14 CXX_STANDARD_REQUIRED YES CXX_EXTENSIONS NO) diff --git a/libcxx/src/barrier.cpp b/libcxx/src/barrier.cpp index 9ee476993b8133..c5e33cbba3bd0b 100644 --- a/libcxx/src/barrier.cpp +++ b/libcxx/src/barrier.cpp @@ -26,15 +26,21 @@ class __barrier_algorithm_base { } __tickets[64]; }; - ptrdiff_t& __expected; - unique_ptr<__state_t[]> __state; + ptrdiff_t& __expected; + unique_ptr __state_allocation; + __state_t* __state; _LIBCPP_HIDDEN __barrier_algorithm_base(ptrdiff_t& __expected) : __expected(__expected) { size_t const __count = (__expected + 1) >> 1; - __state = unique_ptr<__state_t[]>(new __state_t[__count]); + size_t const __size = sizeof(__state_t) * __count; + size_t __allocation_size = __size + alignof(__state_t); + __state_allocation = unique_ptr(new char[__allocation_size]); + void* __allocation = __state_allocation.get(); + void* const __state_ = align(alignof(__state_t), __size, __allocation, __allocation_size); + __state = new (__state_) __barrier_algorithm_base::__state_t[__count]; } _LIBCPP_HIDDEN bool __arrive(__barrier_phase_t __old_phase) From ba1f4405c6828308649b994ecb2d056fabc80a8c Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Tue, 24 Mar 2020 20:02:44 -0700 Subject: [PATCH 22/33] [profile] Move RuntimeCounterRelocation and ProfileDumped into a separate file This avoids the test failure that was introduced in rG32bddad where this function pulls in the rest of InstrProfilingFile.c which is undesirable in use cases when profile runtime is being used without the rest of libc. This also allows additional cleanup by eliminating another variable from platforms that don't need it. Differential Revision: https://reviews.llvm.org/D76750 --- compiler-rt/lib/profile/CMakeLists.txt | 1 + compiler-rt/lib/profile/InstrProfiling.c | 14 ++------ compiler-rt/lib/profile/InstrProfilingFile.c | 14 ++------ .../lib/profile/InstrProfilingInternal.c | 33 +++++++++++++++++++ .../lib/profile/InstrProfilingInternal.h | 6 ++-- .../profile/InstrProfilingPlatformFuchsia.c | 6 ++++ 6 files changed, 47 insertions(+), 27 deletions(-) create mode 100644 compiler-rt/lib/profile/InstrProfilingInternal.c diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt index 8cedbcf4f806c8..ece674b2daa148 100644 --- a/compiler-rt/lib/profile/CMakeLists.txt +++ b/compiler-rt/lib/profile/CMakeLists.txt @@ -51,6 +51,7 @@ add_compiler_rt_component(profile) set(PROFILE_SOURCES GCDAProfiling.c InstrProfiling.c + InstrProfilingInternal.c InstrProfilingValue.c InstrProfilingBiasVar.c InstrProfilingBuffer.c diff --git a/compiler-rt/lib/profile/InstrProfiling.c b/compiler-rt/lib/profile/InstrProfiling.c index 087d1cdd2efe14..31a9fe99629319 100644 --- a/compiler-rt/lib/profile/InstrProfiling.c +++ b/compiler-rt/lib/profile/InstrProfiling.c @@ -25,18 +25,8 @@ COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_magic(void) { : (INSTR_PROF_RAW_MAGIC_32); } -static unsigned ProfileDumped = 0; - -COMPILER_RT_VISIBILITY unsigned lprofProfileDumped() { - return ProfileDumped; -} - -COMPILER_RT_VISIBILITY void lprofSetProfileDumped() { - ProfileDumped = 1; -} - COMPILER_RT_VISIBILITY void __llvm_profile_set_dumped() { - lprofSetProfileDumped(); + lprofSetProfileDumped(1); } /* Return the number of bytes needed to add to SizeInBytes to make it @@ -80,5 +70,5 @@ COMPILER_RT_VISIBILITY void __llvm_profile_reset_counters(void) { } } } - ProfileDumped = 0; + lprofSetProfileDumped(0); } diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index 875de885b2ee74..9e1a54a0c3737c 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -35,16 +35,6 @@ #include "InstrProfilingPort.h" #include "InstrProfilingUtil.h" -static int RuntimeCounterRelocation = 0; - -COMPILER_RT_VISIBILITY unsigned lprofRuntimeCounterRelocation(void) { - return RuntimeCounterRelocation; -} - -COMPILER_RT_VISIBILITY void lprofSetRuntimeCounterRelocation(void) { - RuntimeCounterRelocation = 1; -} - /* From where is profile name specified. * The order the enumerators define their * precedence. Re-order them may lead to @@ -974,7 +964,7 @@ void __llvm_profile_initialize_file(void) { int hasCommandLineOverrider = (INSTR_PROF_PROFILE_NAME_VAR[0] != 0); if (__llvm_profile_counter_bias != -1) - lprofSetRuntimeCounterRelocation(); + lprofSetRuntimeCounterRelocation(1); EnvFilenamePat = getFilenamePatFromEnv(); if (EnvFilenamePat) { @@ -1072,7 +1062,7 @@ int __llvm_profile_dump(void) { "in profile name or change profile name before dumping.\n", "online profile merging is not on"); int rc = __llvm_profile_write_file(); - lprofSetProfileDumped(); + lprofSetProfileDumped(1); return rc; } diff --git a/compiler-rt/lib/profile/InstrProfilingInternal.c b/compiler-rt/lib/profile/InstrProfilingInternal.c new file mode 100644 index 00000000000000..d58bc19ad11e63 --- /dev/null +++ b/compiler-rt/lib/profile/InstrProfilingInternal.c @@ -0,0 +1,33 @@ +/*===- InstrProfilingInternal.c - Support library for PGO instrumentation -===*\ +|* +|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +|* See https://llvm.org/LICENSE.txt for license information. +|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +|* +\*===----------------------------------------------------------------------===*/ + +#if !defined(__Fuchsia__) + +#include "InstrProfilingInternal.h" + +static unsigned ProfileDumped = 0; + +COMPILER_RT_VISIBILITY unsigned lprofProfileDumped() { + return ProfileDumped; +} + +COMPILER_RT_VISIBILITY void lprofSetProfileDumped(unsigned Value) { + ProfileDumped = Value; +} + +static unsigned RuntimeCounterRelocation = 0; + +COMPILER_RT_VISIBILITY unsigned lprofRuntimeCounterRelocation(void) { + return RuntimeCounterRelocation; +} + +COMPILER_RT_VISIBILITY void lprofSetRuntimeCounterRelocation(unsigned Value) { + RuntimeCounterRelocation = Value; +} + +#endif diff --git a/compiler-rt/lib/profile/InstrProfilingInternal.h b/compiler-rt/lib/profile/InstrProfilingInternal.h index 32b63955d93711..904bd394592812 100644 --- a/compiler-rt/lib/profile/InstrProfilingInternal.h +++ b/compiler-rt/lib/profile/InstrProfilingInternal.h @@ -181,12 +181,12 @@ uint64_t lprofGetLoadModuleSignature(); * Return non zero value if the profile data has already been * dumped to the file. */ -unsigned lprofProfileDumped(); -void lprofSetProfileDumped(); +unsigned lprofProfileDumped(void); +void lprofSetProfileDumped(unsigned); /* Return non zero value if counters are being relocated at runtime. */ unsigned lprofRuntimeCounterRelocation(void); -void lprofSetRuntimeCounterRelocation(void); +void lprofSetRuntimeCounterRelocation(unsigned); COMPILER_RT_VISIBILITY extern void (*FreeHook)(void *); COMPILER_RT_VISIBILITY extern uint8_t *DynamicBufferIOBuffer; diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformFuchsia.c b/compiler-rt/lib/profile/InstrProfilingPlatformFuchsia.c index 63c8b75b9d12c3..828f7422139362 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformFuchsia.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformFuchsia.c @@ -34,9 +34,15 @@ #include "InstrProfilingInternal.h" #include "InstrProfilingUtil.h" +COMPILER_RT_VISIBILITY unsigned lprofProfileDumped() { + return 1; +} +COMPILER_RT_VISIBILITY void lprofSetProfileDumped(unsigned Value) {} + COMPILER_RT_VISIBILITY unsigned lprofRuntimeCounterRelocation(void) { return 1; } +COMPILER_RT_VISIBILITY void lprofSetRuntimeCounterRelocation(unsigned Value) {} /* VMO that contains the profile data for this module. */ static zx_handle_t __llvm_profile_vmo; From c2273883e23a3278b5da562ba5827462afef2ae3 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 25 Mar 2020 03:27:56 +0000 Subject: [PATCH 23/33] [gn build] Port ba1f4405c68 --- llvm/utils/gn/secondary/compiler-rt/lib/profile/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/profile/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/profile/BUILD.gn index 777b2da40be8cd..f5fc87c102682b 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/profile/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/profile/BUILD.gn @@ -37,6 +37,7 @@ static_library("profile") { "InstrProfilingBiasVar.c", "InstrProfilingBuffer.c", "InstrProfilingFile.c", + "InstrProfilingInternal.c", "InstrProfilingInternal.h", "InstrProfilingMerge.c", "InstrProfilingMergeFile.c", From 8786cdb3cdfbe5f7a823a122431e1bea675c81f4 Mon Sep 17 00:00:00 2001 From: Uday Bondhugula Date: Wed, 25 Mar 2020 07:53:05 +0530 Subject: [PATCH 24/33] [MLIR][NFC] std::is_same || -> llvm::is_one_of Switch std::is_same disjunctions to llvm::is_one_of Differential Revision: https://reviews.llvm.org/D76745 --- mlir/lib/Analysis/LoopAnalysis.cpp | 6 +++--- mlir/lib/Analysis/Utils.cpp | 16 ++++++++-------- mlir/lib/Dialect/Affine/IR/AffineOps.cpp | 14 +++++--------- 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/mlir/lib/Analysis/LoopAnalysis.cpp b/mlir/lib/Analysis/LoopAnalysis.cpp index 9cf8fbc815d2a9..9b5725188faeab 100644 --- a/mlir/lib/Analysis/LoopAnalysis.cpp +++ b/mlir/lib/Analysis/LoopAnalysis.cpp @@ -220,9 +220,9 @@ DenseSet mlir::getInvariantAccesses(Value iv, ArrayRef indices) { template static bool isContiguousAccess(Value iv, LoadOrStoreOp memoryOp, int *memRefDim) { - static_assert(std::is_same::value || - std::is_same::value, - "Must be called on either const LoadOp & or const StoreOp &"); + static_assert( + llvm::is_one_of::value, + "Must be called on either LoadOp or StoreOp"); assert(memRefDim && "memRefDim == nullptr"); auto memRefType = memoryOp.getMemRefType(); diff --git a/mlir/lib/Analysis/Utils.cpp b/mlir/lib/Analysis/Utils.cpp index 940e07630e2045..9e400e4b6a3c47 100644 --- a/mlir/lib/Analysis/Utils.cpp +++ b/mlir/lib/Analysis/Utils.cpp @@ -368,16 +368,16 @@ Optional mlir::getMemRefSizeInBytes(MemRefType memRefType) { return sizeInBytes; } -template -LogicalResult mlir::boundCheckLoadOrStoreOp(LoadOrStoreOpPointer loadOrStoreOp, +template +LogicalResult mlir::boundCheckLoadOrStoreOp(LoadOrStoreOp loadOrStoreOp, bool emitError) { - static_assert(std::is_same::value || - std::is_same::value, - "argument should be either a AffineLoadOp or a AffineStoreOp"); + static_assert( + llvm::is_one_of::value, + "argument should be either a AffineLoadOp or a AffineStoreOp"); - Operation *opInst = loadOrStoreOp.getOperation(); - MemRefRegion region(opInst->getLoc()); - if (failed(region.compute(opInst, /*loopDepth=*/0, /*sliceState=*/nullptr, + Operation *op = loadOrStoreOp.getOperation(); + MemRefRegion region(op->getLoc()); + if (failed(region.compute(op, /*loopDepth=*/0, /*sliceState=*/nullptr, /*addMemRefDimBounds=*/false))) return success(); diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index 9e1f7e96bb0b1d..7bf8ab4e07a58b 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -629,8 +629,7 @@ static void canonicalizePromotedSymbols(MapOrSet *mapOrSet, template static void canonicalizeMapOrSetAndOperands(MapOrSet *mapOrSet, SmallVectorImpl *operands) { - static_assert(std::is_same::value || - std::is_same::value, + static_assert(llvm::is_one_of::value, "Argument must be either of AffineMap or IntegerSet type"); if (!mapOrSet || operands->empty()) @@ -729,13 +728,10 @@ struct SimplifyAffineOp : public OpRewritePattern { LogicalResult matchAndRewrite(AffineOpTy affineOp, PatternRewriter &rewriter) const override { - static_assert(std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value, - "affine load/store/apply op expected"); + static_assert(llvm::is_one_of::value, + "affine load/store/apply/prefetch/min/max op expected"); auto map = affineOp.getAffineMap(); AffineMap oldMap = map; auto oldOperands = affineOp.getMapOperands(); From 2c080a3f23e49edddb0799a569d7d3319fc91d86 Mon Sep 17 00:00:00 2001 From: Siva Chandra Reddy Date: Tue, 24 Mar 2020 10:52:56 -0700 Subject: [PATCH 25/33] [libc] Generate math.h instead of the static file it is currently. Just enough to help a follow up patch adding cosf and sinf has been added. Reviewers: abrachet Differential Revision: https://reviews.llvm.org/D76723 --- libc/config/linux/api.td | 47 ++++- libc/include/CMakeLists.txt | 6 +- libc/include/math.h | 360 ------------------------------------ libc/include/math.h.def | 16 ++ libc/spec/stdc.td | 19 +- 5 files changed, 80 insertions(+), 368 deletions(-) delete mode 100644 libc/include/math.h create mode 100644 libc/include/math.h.def diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td index d9c2470b6e52c9..f520947b998ab1 100644 --- a/libc/config/linux/api.td +++ b/libc/config/linux/api.td @@ -71,10 +71,53 @@ def AssertAPI : PublicAPI<"assert.h"> { ]; } +def MathErrHandlingMacro : MacroDef<"math_errhandling"> { + let Defn = [{ + #ifndef math_errhandling + #ifdef __FAST_MATH__ + #define math_errhandling 0 + #elif defined __NO_MATH_ERRNO__ + #define math_errhandling (MATH_ERREXCEPT) + #else + #define math_errhandling (MATH_ERRNO | MATH_ERREXCEPT) + #endif + #endif // math_errhandling not defined + }]; +} + +def IsFiniteMacro : MacroDef<"isfinite"> { + let Defn = [{ + #define isfinite(x) __builtin_isfinite(x) + }]; +} + +def IsInfMacro : MacroDef<"isinf"> { + let Defn = [{ + #define isinf(x) __builtin_isinf(x) + }]; +} + +def IsNanMacro : MacroDef<"isnan"> { + let Defn = [{ + #define isnan(x) __builtin_isnan(x) + }]; +} + def MathAPI : PublicAPI<"math.h"> { + let Macros = [ + SimpleMacroDef<"MATH_ERRNO", "1">, + SimpleMacroDef<"MATH_ERREXCEPT", "2">, + MathErrHandlingMacro, + + SimpleMacroDef<"INFINITY", "__builtin_inff()">, + SimpleMacroDef<"NAN", "__builtin_nanf(\"\")">, + + IsFiniteMacro, + IsInfMacro, + IsNanMacro, + ]; let Functions = [ - "acos", - "acosl", + "round", ]; } diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 2ff04231e4060c..8e48a636a8ea46 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -19,10 +19,10 @@ add_header( llvm_libc_common_h ) -add_header( +add_gen_header( math_h - HDR - math.h + DEF_FILE math.h.def + GEN_HDR math.h DEPENDS llvm_libc_common_h ) diff --git a/libc/include/math.h b/libc/include/math.h deleted file mode 100644 index 6cd258b3c633e9..00000000000000 --- a/libc/include/math.h +++ /dev/null @@ -1,360 +0,0 @@ -//===----------------- C standard library header math.h -----------------*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_MATH_H -#define LLVM_LIBC_MATH_H - -#include <__llvm-libc-common.h> - -__BEGIN_C_DECLS - -double acos(double); - -float acosf(float); - -long double acosl(long double); - -double asin(double); - -float asinf(float); - -long double asinl(long double); - -double atan(double); - -float atanf(float); - -long double atanl(long double); - -double atan2(double, double); - -float atan2f(float, float); - -long double atan2l(long double, long double); - -double cos(double); - -float cosf(float); - -long double cosl(long double); - -double sin(double); - -float sinf(float); - -long double sinl(long double); - -double tan(double); - -float tanf(float); - -long double tanl(long double); - -double acosh(double); - -float acoshf(float); - -long double acoshl(long double); - -double asinh(double); - -float asinhf(float); - -long double asinhl(long double); - -double atanh(double); - -float atanhf(float); - -long double atanhl(long double); - -double cosh(double); - -float coshf(float); - -long double coshl(long double); - -double sinh(double); - -float sinhf(float); - -long double sinhl(long double); - -double tanh(double); - -float tanhf(float); - -long double tanhl(long double); - -double exp(double); - -float expf(float); - -long double expl(long double); - -double exp2(double); - -float exp2f(float); - -long double exp2l(long double); - -double expm1(double); - -float expm1f(float); - -long double expm1l(long double); - -double frexp(double, int); - -float frexpf(float, int); - -long double frexpl(long double, int); - -int ilogb(double); - -int ilogbf(float); - -int ilogbl(long double); - -double ldexp(double, int); - -float ldexpf(float, int); - -long double ldexpl(long double, int); - -double log(double); - -float logf(float); - -long double logl(long double); - -double log10(double); - -float log10f(float); - -long double log10l(long double); - -double log1p(double); - -float log1pf(float); - -long double log1pl(long double); - -double log2(double); - -float log2f(float); - -long double log2l(long double); - -double logb(double); - -float logbf(float); - -long double logbl(long double); - -double modf(double, double); - -float modff(float, float); - -long double modfl(long double, long double); - -double scalbn(double, int); - -float scalbnf(float, int); - -long double scalbnl(long double, int); - -double scalbln(double, long int); - -float scalblnf(float, long int); - -long double scalblnl(long double, long int); - -double cbrt(double); - -float cbrtf(float); - -long double cbrtl(long double); - -double fabs(double); - -float fabsf(float); - -long double fabsl(long double); - -double hypot(double, double); - -float hypotf(float, float); - -long double hypotl(long double, long double); - -double pow(double, double); - -float powf(float, float); - -long double powl(long double, long double); - -double sqrt(double); - -float sqrtf(float); - -long double sqrtl(long double); - -double erf(double); - -float erff(float); - -long double erfl(long double); - -double erfc(double); - -float erfcf(float); - -long double erfcl(long double); - -double lgamma(double); - -float lgammaf(float); - -long double lgammal(long double); - -double tgamma(double); - -float tgammaf(float); - -long double tgammal(long double); - -double ceil(double); - -float ceilf(float); - -long double ceill(long double); - -double floor(double); - -float floorf(float); - -long double floorl(long double); - -double nearbyint(double); - -float nearbyintf(float); - -long double nearbyintl(long double); - -double rint(double); - -float rintf(float); - -long double rintl(long double); - -long int lrint(double); - -long int lrintf(float); - -long int lrintl(long double); - -long long int llrint(double); - -long long int llrintf(float); - -long long int llrintl(long double); - -double round(double); - -float roundf(float); - -long double roundl(long double); - -long int lround(double); - -long int lroundf(float); - -long int lroundl(long double); - -long long int llround(double); - -long long int llroundf(float); - -long long int llroundl(long double); - -double trunc(double); - -float truncf(float); - -long double truncl(long double); - -double fmod(double, double); - -float fmodf(float, float); - -long double fmodl(long double, long double); - -double remainder(double, double); - -float remainderf(float, float); - -long double remainderl(long double, long double); - -double remquo(double, double, int); - -float remquof(float, float, int); - -long double remquol(long double, long double, int); - -double copysign(double, double); - -float copysignf(float, float); - -long double copysignl(long double, long double); - -double nan(const char); - -float nanf(const char); - -long double nanl(const char); - -double nextafter(double, double); - -float nextafterf(float, float); - -long double nextafterl(long double, long double); - -double nexttoward(double, long double); - -float nexttowardf(float, long double); - -long double nexttowardl(long double, long double); - -double fdim(double, double); - -float fdimf(float, float); - -long double fdiml(long double, long double); - -double fmax(double, double); - -double fmaxf(double, double); - -double fmaxl(double, double); - -double fmin(double, double); - -float fminf(float, float); - -long double fminl(long double, long double); - -double fma(double, double, double); - -float fmaf(float, float, float); - -long double fmal(long double, long double, long double); - -__END_C_DECLS - -#endif // LLVM_LIBC_MATH_H diff --git a/libc/include/math.h.def b/libc/include/math.h.def new file mode 100644 index 00000000000000..7ede8545cd5e6c --- /dev/null +++ b/libc/include/math.h.def @@ -0,0 +1,16 @@ +//===----------------- C standard library header math.h -----------------*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_MATH_H +#define LLVM_LIBC_MATH_H + +#include <__llvm-libc-common.h> + +%%public_api() + +#endif // LLVM_LIBC_MATH_H diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 8505b70c7eee2c..39067bcdda12d1 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -160,15 +160,28 @@ def StdC : StandardSpec<"stdc"> { HeaderSpec Math = HeaderSpec< "math.h", - [], // Macros + [ + Macro<"MATH_ERRNO">, + Macro<"MATH_ERREXCEPT">, + Macro<"math_errhandling">, + + Macro<"INFINITY">, + Macro<"NAN">, + + Macro<"isfinite">, + Macro<"isinf">, + Macro<"isnan">, + ], [ NamedType<"float_t">, NamedType<"double_t">, ], [], // Enumerations [ - FunctionSpec<"acos", RetValSpec, [ArgSpec]>, - FunctionSpec<"acosl", RetValSpec, [ArgSpec]>, + FunctionSpec<"cosf", RetValSpec, [ArgSpec]>, + FunctionSpec<"sinf", RetValSpec, [ArgSpec]>, + + FunctionSpec<"round", RetValSpec, [ArgSpec]>, ] >; From 70f9f4dd9d19ed2cec0d9adf60fede9401898b85 Mon Sep 17 00:00:00 2001 From: Kai Luo Date: Wed, 25 Mar 2020 06:19:59 +0000 Subject: [PATCH 26/33] [PowerPC] Pre-commit reduced test case for PR45297. NFC. --- llvm/test/CodeGen/PowerPC/pr45297.ll | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 llvm/test/CodeGen/PowerPC/pr45297.ll diff --git a/llvm/test/CodeGen/PowerPC/pr45297.ll b/llvm/test/CodeGen/PowerPC/pr45297.ll new file mode 100644 index 00000000000000..71c19744fb7068 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/pr45297.ll @@ -0,0 +1,10 @@ +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mattr=+altivec -mattr=-power8-vector -mattr=-vsx < %s | FileCheck %s +; XFAIL: * + +define dso_local void @test(float %0) local_unnamed_addr { +entry: + %1 = fptosi float %0 to i32 + store i32 %1, i32* undef, align 4 + ret void +} From 2093fdd429d52348e08969180ac6b1e705fc4ff6 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 24 Mar 2020 19:03:36 -0700 Subject: [PATCH 27/33] [X86] Split masked instruction tests to enable D60940. We need to split tests that rely on isel duplicating operations for different masking conditions. Repeating the operation is more costly than emitting the masking separately. The change here is a mechanical splitting of tests that call multiple intrinsics in one function into separate functions that call one intrinsic. We could obviously avoid the splitting by giving the intrinsics different operands, but that would need closer scrutiny than just splitting. --- .../CodeGen/X86/avx512-intrinsics-upgrade.ll | 2388 +++++---- llvm/test/CodeGen/X86/avx512-intrinsics.ll | 494 +- .../X86/avx512bw-intrinsics-upgrade.ll | 826 ++-- llvm/test/CodeGen/X86/avx512bw-intrinsics.ll | 234 +- .../X86/avx512bwvl-intrinsics-upgrade.ll | 2026 +++++--- .../test/CodeGen/X86/avx512bwvl-intrinsics.ll | 344 +- .../X86/avx512dq-intrinsics-upgrade.ll | 439 +- .../X86/avx512dqvl-intrinsics-upgrade.ll | 334 +- .../test/CodeGen/X86/avx512dqvl-intrinsics.ll | 429 +- .../X86/avx512vbmi-intrinsics-upgrade.ll | 140 +- .../test/CodeGen/X86/avx512vbmi-intrinsics.ll | 150 +- .../X86/avx512vbmivl-intrinsics-upgrade.ll | 280 +- .../CodeGen/X86/avx512vbmivl-intrinsics.ll | 300 +- .../X86/avx512vl-intrinsics-upgrade.ll | 4303 +++++++++++------ llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 892 ++-- 15 files changed, 8641 insertions(+), 4938 deletions(-) diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 888829e8875222..851ce75b4632f5 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -26,189 +26,293 @@ define i16 @unpckbw_test(i16 %a0, i16 %a1) { ret i16 %res } +define <16 x i32>@test_int_x86_avx512_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcastd_gpr_512: +; X86: ## %bb.0: +; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x18,0x44,0x24,0x01] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcastd_gpr_512: +; X64: ## %bb.0: +; X64-NEXT: vpbroadcastd %edi, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x7c,0xc7] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512: ; X86: ## %bb.0: -; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x4c,0x24,0x01] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6f,0xc1] -; X86-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6f,0xd1] -; X86-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] +; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x58,0x44,0x24,0x01] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512: ; X64: ## %bb.0: -; X64-NEXT: vpbroadcastd %edi, %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x7c,0xcf] ; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpbroadcastd %edi, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x7c,0xc7] -; X64-NEXT: vpbroadcastd %edi, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7c,0xd7] -; X64-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res2, %res3 - ret <16 x i32> %res4 - } + %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_pbroadcastd_gpr_512(i32 %x0, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastd_gpr_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x58,0x44,0x24,0x01] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastd_gpr_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] +; X64-NEXT: vpbroadcastd %edi, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7c,0xc7] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16) +define <8 x i64>@test_int_x86_avx512_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcastq_gpr_512: +; X86: ## %bb.0: +; X86-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x19,0x84,0x24,0x04,0x00,0x00,0x00] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcastq_gpr_512: +; X64: ## %bb.0: +; X64-NEXT: vpbroadcastq %rdi, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x7c,0xc7] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1) + ret <8 x i64> %res +} define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512: ; X86: ## %bb.0: -; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x8c,0x24,0x04,0x00,0x00,0x00] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6f,0xc1] -; X86-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6f,0xd1] -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x59,0x84,0x24,0x04,0x00,0x00,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512: ; X64: ## %bb.0: -; X64-NEXT: vpbroadcastq %rdi, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x7c,0xcf] ; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x7c,0xc7] -; X64-NEXT: vpbroadcastq %rdi, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x7c,0xd7] -; X64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res2, %res3 - ret <8 x i64> %res4 + %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_pbroadcastq_gpr_512(i64 %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastq_gpr_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x59,0x84,0x24,0x04,0x00,0x00,0x00] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastq_gpr_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] +; X64-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x7c,0xc7] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask) + ret <8 x i64> %res } + declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8) declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly -define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) { -; X86-LABEL: test_x86_vbroadcast_ss_ps_512: +define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1) { +; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x18,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + + %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> undef, i16 -1) + ret <16 x float> %res +} + +define <16 x float> @test_x86_mask_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) { +; X86-LABEL: test_x86_mask_vbroadcast_ss_ps_512: ; X86: ## %bb.0: -; X86-NEXT: vbroadcastss %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x18,0xd0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x18,0xc8] -; X86-NEXT: vaddps %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9] -; X86-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x18,0xc0] -; X86-NEXT: vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1] +; X86-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; -; X64-LABEL: test_x86_vbroadcast_ss_ps_512: +; X64-LABEL: test_x86_mask_vbroadcast_ss_ps_512: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastss %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x18,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x18,0xc8] -; X64-NEXT: vaddps %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9] +; X64-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + + %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask) + ret <16 x float> %res +} + +define <16 x float> @test_x86_maskz_vbroadcast_ss_ps_512(<4 x float> %a0, i16 %mask ) { +; X86-LABEL: test_x86_maskz_vbroadcast_ss_ps_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x18,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_x86_maskz_vbroadcast_ss_ps_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x18,0xc0] -; X64-NEXT: vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 -1) - %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask) - %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res2, %res3 - ret <16 x float> %res4 + %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask) + ret <16 x float> %res } declare <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double>, <8 x double>, i8) nounwind readonly -define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask ) { -; X86-LABEL: test_x86_vbroadcast_sd_pd_512: +define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1) { +; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x19,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + + %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> undef, i8 -1) + ret <8 x double> %res +} + +define <8 x double> @test_x86_mask_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask ) { +; X86-LABEL: test_x86_mask_vbroadcast_sd_pd_512: ; X86: ## %bb.0: -; X86-NEXT: vbroadcastsd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x19,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x19,0xc8] -; X86-NEXT: vaddpd %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc9] -; X86-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x19,0xc0] -; X86-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1] +; X86-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; -; X64-LABEL: test_x86_vbroadcast_sd_pd_512: +; X64-LABEL: test_x86_mask_vbroadcast_sd_pd_512: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastsd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x19,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x19,0xc8] -; X64-NEXT: vaddpd %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc9] +; X64-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + + %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask) + ret <8 x double> %res +} + +define <8 x double> @test_x86_maskz_vbroadcast_sd_pd_512(<2 x double> %a0, i8 %mask ) { +; X86-LABEL: test_x86_maskz_vbroadcast_sd_pd_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x19,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_x86_maskz_vbroadcast_sd_pd_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x19,0xc0] -; X64-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 -1) - %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask) - %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res2, %res3 - ret <8 x double> %res4 + %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res } declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16) -define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastd_512: +define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x18,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastd_512: ; X86: ## %bb.0: -; X86-NEXT: vpbroadcastd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0xd0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x58,0xc8] -; X86-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x58,0xc0] -; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] -; X86-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastd_512: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastd_512: ; X64: ## %bb.0: -; X64-NEXT: vpbroadcastd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x58,0xc8] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_pbroadcastd_512(<4 x i32> %x0, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastd_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x58,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastd_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x58,0xc0] -; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] -; X64-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1) - %res1 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) - %res2 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res2, %res3 - ret <16 x i32> %res4 + %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8) -define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastq_512: +define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x19,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastq_512: ; X86: ## %bb.0: -; X86-NEXT: vpbroadcastq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x59,0xc8] -; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x59,0xc0] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] -; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastq_512: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastq_512: ; X64: ## %bb.0: -; X64-NEXT: vpbroadcastq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x59,0xc8] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_pbroadcastq_512(<2 x i64> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastq_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x59,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastq_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x59,0xc0] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] -; X64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1) - %res1 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask) - %res2 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer,i8 %mask) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res2, %res3 - ret <8 x i64> %res4 + %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer,i8 %mask) + ret <8 x i64> %res } declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16) @@ -285,113 +389,158 @@ define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_movddup_512(<8 x double> %x0, <8 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_movddup_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovddup %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xff,0x48,0x12,0xc0] +; CHECK-NEXT: ## zmm0 = zmm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_movddup_512: ; X86: ## %bb.0: -; X86-NEXT: vmovddup %zmm0, %zmm2 ## encoding: [0x62,0xf1,0xff,0x48,0x12,0xd0] -; X86-NEXT: ## zmm2 = zmm0[0,0,2,2,4,4,6,6] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vmovddup %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x49,0x12,0xc8] ; X86-NEXT: ## zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6] -; X86-NEXT: vaddpd %zmm2, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xca] -; X86-NEXT: vmovddup %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xc9,0x12,0xc0] -; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] -; X86-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1] +; X86-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_movddup_512: ; X64: ## %bb.0: -; X64-NEXT: vmovddup %zmm0, %zmm2 ## encoding: [0x62,0xf1,0xff,0x48,0x12,0xd0] -; X64-NEXT: ## zmm2 = zmm0[0,0,2,2,4,4,6,6] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vmovddup %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x49,0x12,0xc8] ; X64-NEXT: ## zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6] -; X64-NEXT: vaddpd %zmm2, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xca] +; X64-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_movddup_512(<8 x double> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_movddup_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovddup %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xc9,0x12,0xc0] +; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_movddup_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vmovddup %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xc9,0x12,0xc0] ; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] -; X64-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2) - %res1 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1) - %res2 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res2, %res3 - ret <8 x double> %res4 + %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2) + ret <8 x double> %res } declare <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double>, i32, <8 x double>, i8) -define <8 x double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, i32 %x1, <8 x double> %x2, i8 %x3) { +define <8 x double>@test_int_x86_avx512_perm_df_512(<8 x double> %x0, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_perm_df_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermpd $3, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x01,0xc0,0x03] +; CHECK-NEXT: ## zmm0 = zmm0[3,0,0,0,7,4,4,4] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_perm_df_512: ; X86: ## %bb.0: -; X86-NEXT: vpermpd $3, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x01,0xd0,0x03] -; X86-NEXT: ## zmm2 = zmm0[3,0,0,0,7,4,4,4] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermpd $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x01,0xc8,0x03] ; X86-NEXT: ## zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4] -; X86-NEXT: vpermpd $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x01,0xc0,0x03] -; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4] -; X86-NEXT: vaddpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc0] -; X86-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc2] +; X86-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_perm_df_512: ; X64: ## %bb.0: -; X64-NEXT: vpermpd $3, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x01,0xd0,0x03] -; X64-NEXT: ## zmm2 = zmm0[3,0,0,0,7,4,4,4] -; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermpd $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x01,0xc8,0x03] ; X64-NEXT: ## zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4] +; X64-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 %x3) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_perm_df_512(<8 x double> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_perm_df_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermpd $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x01,0xc0,0x03] +; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_perm_df_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermpd $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x01,0xc0,0x03] ; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4] -; X64-NEXT: vaddpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc0] -; X64-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 %x3) - %res1 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> zeroinitializer, i8 %x3) - %res2 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 -1) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res3, %res2 - ret <8 x double> %res4 + %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> zeroinitializer, i8 %x3) + ret <8 x double> %res } declare <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64>, i32, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_perm_di_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermpd $3, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x01,0xc0,0x03] +; CHECK-NEXT: ## zmm0 = zmm0[3,0,0,0,7,4,4,4] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_perm_di_512: ; X86: ## %bb.0: -; X86-NEXT: vpermq $3, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x00,0xd0,0x03] -; X86-NEXT: ## zmm2 = zmm0[3,0,0,0,7,4,4,4] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermq $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x00,0xc8,0x03] ; X86-NEXT: ## zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4] -; X86-NEXT: vpermq $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x00,0xc0,0x03] -; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4] -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_perm_di_512: ; X64: ## %bb.0: -; X64-NEXT: vpermq $3, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x00,0xd0,0x03] -; X64-NEXT: ## zmm2 = zmm0[3,0,0,0,7,4,4,4] ; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpermq $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x00,0xc8,0x03] ; X64-NEXT: ## zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_perm_di_512(<8 x i64> %x0, i32 %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_perm_di_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermq $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x00,0xc0,0x03] +; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_perm_di_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpermq $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x00,0xc0,0x03] ; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4] -; X64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res3, %res2 - ret <8 x i64> %res4 + %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3) + ret <8 x i64> %res } define void @test_store1(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) { @@ -830,111 +979,154 @@ define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, i8* %ptr, i8 %mask) declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermil_pd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermilpd $22, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x05,0xc0,0x16] +; CHECK-NEXT: ## zmm0 = zmm0[0,1,3,2,5,4,6,6] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermil_pd_512: ; X86: ## %bb.0: -; X86-NEXT: vpermilpd $22, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x05,0xd0,0x16] -; X86-NEXT: ## zmm2 = zmm0[0,1,3,2,5,4,6,6] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermilpd $22, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x05,0xc8,0x16] ; X86-NEXT: ## zmm1 {%k1} = zmm0[0,1,3,2,5,4,6,6] -; X86-NEXT: vpermilpd $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x05,0xc0,0x16] -; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[0,1,3,2,5,4,6,6] -; X86-NEXT: vaddpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc0] -; X86-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc2] +; X86-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermil_pd_512: ; X64: ## %bb.0: -; X64-NEXT: vpermilpd $22, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x05,0xd0,0x16] -; X64-NEXT: ## zmm2 = zmm0[0,1,3,2,5,4,6,6] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilpd $22, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x05,0xc8,0x16] ; X64-NEXT: ## zmm1 {%k1} = zmm0[0,1,3,2,5,4,6,6] +; X64-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_vpermil_pd_512(<8 x double> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermil_pd_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermilpd $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x05,0xc0,0x16] +; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[0,1,3,2,5,4,6,6] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermil_pd_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilpd $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x05,0xc0,0x16] ; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[0,1,3,2,5,4,6,6] -; X64-NEXT: vaddpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc0] -; X64-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3) - %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3) - %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res3, %res2 - ret <8 x double> %res4 + %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3) + ret <8 x double> %res } declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermil_ps_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermilps $22, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x04,0xc0,0x16] +; CHECK-NEXT: ## zmm0 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermil_ps_512: ; X86: ## %bb.0: -; X86-NEXT: vpermilps $22, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0x7d,0x48,0x04,0xd0,0x16] -; X86-NEXT: ## zmm2 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermilps $22, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x04,0xc8,0x16] ; X86-NEXT: ## zmm1 {%k1} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12] -; X86-NEXT: vpermilps $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x04,0xc0,0x16] -; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12] -; X86-NEXT: vaddps %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc0] -; X86-NEXT: vaddps %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc2] +; X86-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermil_ps_512: ; X64: ## %bb.0: -; X64-NEXT: vpermilps $22, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0x7d,0x48,0x04,0xd0,0x16] -; X64-NEXT: ## zmm2 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps $22, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x04,0xc8,0x16] ; X64-NEXT: ## zmm1 {%k1} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12] +; X64-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_maskz_vpermil_ps_512(<16 x float> %x0, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermil_ps_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermilps $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x04,0xc0,0x16] +; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermil_ps_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x04,0xc0,0x16] ; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12] -; X64-NEXT: vaddps %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc0] -; X64-NEXT: vaddps %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3) - %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3) - %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res3, %res2 - ret <16 x float> %res4 + %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3) + ret <16 x float> %res } declare <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32>, i32, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshuf_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermilps $3, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x04,0xc0,0x03] +; CHECK-NEXT: ## zmm0 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshuf_d_512: ; X86: ## %bb.0: -; X86-NEXT: vpshufd $3, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0x7d,0x48,0x70,0xd0,0x03] -; X86-NEXT: ## zmm2 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpshufd $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x70,0xc8,0x03] ; X86-NEXT: ## zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] -; X86-NEXT: vpshufd $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x70,0xc0,0x03] -; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] -; X86-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshuf_d_512: ; X64: ## %bb.0: -; X64-NEXT: vpshufd $3, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0x7d,0x48,0x70,0xd0,0x03] -; X64-NEXT: ## zmm2 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] ; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpshufd $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x70,0xc8,0x03] ; X64-NEXT: ## zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_pshuf_d_512(<16 x i32> %x0, i32 %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pshuf_d_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpshufd $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x70,0xc0,0x03] +; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pshuf_d_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpshufd $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x70,0xc0,0x03] ; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] -; X64-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res3, %res2 - ret <16 x i32> %res4 + %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3) + ret <16 x i32> %res } define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) { @@ -1083,238 +1275,281 @@ declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8) declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x double>, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckh_pd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vunpckhpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x15,0xc1] +; CHECK-NEXT: ## zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckh_pd_512: ; X86: ## %bb.0: -; X86-NEXT: vunpckhpd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x15,0xd9] -; X86-NEXT: ## zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpckhpd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x15,0xd1] ; X86-NEXT: ## zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; X86-NEXT: vaddpd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc3] +; X86-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckh_pd_512: ; X64: ## %bb.0: -; X64-NEXT: vunpckhpd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x15,0xd9] -; X64-NEXT: ## zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpckhpd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x15,0xd1] ; X64-NEXT: ## zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; X64-NEXT: vaddpd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc3] +; X64-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) - %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1) - %res2 = fadd <8 x double> %res, %res1 - ret <8 x double> %res2 + ret <8 x double> %res } declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckh_ps_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vunpckhps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x15,0xc1] +; CHECK-NEXT: ## zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckh_ps_512: ; X86: ## %bb.0: -; X86-NEXT: vunpckhps %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x15,0xd9] -; X86-NEXT: ## zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vunpckhps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x15,0xd1] ; X86-NEXT: ## zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; X86-NEXT: vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3] +; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckh_ps_512: ; X64: ## %bb.0: -; X64-NEXT: vunpckhps %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x15,0xd9] -; X64-NEXT: ## zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpckhps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x15,0xd1] ; X64-NEXT: ## zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; X64-NEXT: vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3] +; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) - %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 + ret <16 x float> %res } declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x double>, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckl_pd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vunpcklpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x14,0xc1] +; CHECK-NEXT: ## zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckl_pd_512: ; X86: ## %bb.0: -; X86-NEXT: vunpcklpd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x14,0xd9] -; X86-NEXT: ## zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpcklpd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x14,0xd1] ; X86-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; X86-NEXT: vaddpd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc3] +; X86-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckl_pd_512: ; X64: ## %bb.0: -; X64-NEXT: vunpcklpd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x14,0xd9] -; X64-NEXT: ## zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpcklpd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x14,0xd1] ; X64-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; X64-NEXT: vaddpd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc3] +; X64-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) - %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1) - %res2 = fadd <8 x double> %res, %res1 - ret <8 x double> %res2 + ret <8 x double> %res } declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x float>, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckl_ps_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vunpcklps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x14,0xc1] +; CHECK-NEXT: ## zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckl_ps_512: ; X86: ## %bb.0: -; X86-NEXT: vunpcklps %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x14,0xd9] -; X86-NEXT: ## zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vunpcklps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x14,0xd1] ; X86-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; X86-NEXT: vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3] +; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckl_ps_512: ; X64: ## %bb.0: -; X64-NEXT: vunpcklps %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x14,0xd9] -; X64-NEXT: ## zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpcklps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x14,0xd1] ; X64-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; X64-NEXT: vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3] +; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) - %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 + ret <16 x float> %res } declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpcklqd_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vunpcklpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x14,0xc1] +; CHECK-NEXT: ## zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6c,0xd9] -; X86-NEXT: ## zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6c,0xd1] ; X86-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; X86-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6c,0xc1] -; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; X86-NEXT: vpaddq %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xe5,0x48,0xd4,0xc0] -; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6c,0xd9] -; X64-NEXT: ## zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6c,0xd1] ; X64-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_punpcklqd_q_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6c,0xc1] +; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_punpcklqd_q_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6c,0xc1] ; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; X64-NEXT: vpaddq %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xe5,0x48,0xd4,0xc0] -; X64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer,i8 %x3) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res2, %res3 - ret <8 x i64> %res4 + %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer,i8 %x3) + ret <8 x i64> %res } declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhqd_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vunpckhpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x15,0xc1] +; CHECK-NEXT: ## zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpunpckhqdq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6d,0xd9] -; X86-NEXT: ## zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpckhqdq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6d,0xd1] ; X86-NEXT: ## zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; X86-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpunpckhqdq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6d,0xd9] -; X64-NEXT: ## zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpckhqdq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6d,0xd1] ; X64-NEXT: ## zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; X64-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 + ret <8 x i64> %res } declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhd_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vunpckhps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x15,0xc1] +; CHECK-NEXT: ## zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhd_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpunpckhdq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7d,0x48,0x6a,0xd9] -; X86-NEXT: ## zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpckhdq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6a,0xd1] ; X86-NEXT: ## zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; X86-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhd_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpunpckhdq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7d,0x48,0x6a,0xd9] -; X64-NEXT: ## zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpckhdq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6a,0xd1] ; X64-NEXT: ## zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; X64-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckld_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vunpcklps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x14,0xc1] +; CHECK-NEXT: ## zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckld_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpunpckldq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7d,0x48,0x62,0xd9] -; X86-NEXT: ## zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpckldq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x62,0xd1] ; X86-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; X86-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckld_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpunpckldq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7d,0x48,0x62,0xd9] -; X64-NEXT: ## zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpckldq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x62,0xd1] ; X64-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; X64-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) { @@ -2637,387 +2872,471 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32>, <16 x i32>, <16 declare <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float>, <16 x float>, i32, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_f32x4: +; CHECK: ## %bb.0: +; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x23,0xc1,0x16] +; CHECK-NEXT: ## zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_f32x4: ; X86: ## %bb.0: -; X86-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x23,0xd9,0x16] -; X86-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vshuff32x4 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x23,0xd1,0x16] ; X86-NEXT: ## zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] -; X86-NEXT: vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3] +; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_f32x4: ; X64: ## %bb.0: -; X64-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x23,0xd9,0x16] -; X64-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshuff32x4 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x23,0xd1,0x16] ; X64-NEXT: ## zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] -; X64-NEXT: vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3] +; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 + ret <16 x float> %res } declare <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double>, <8 x double>, i32, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_f64x2: +; CHECK: ## %bb.0: +; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x23,0xc1,0x16] +; CHECK-NEXT: ## zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_f64x2: ; X86: ## %bb.0: -; X86-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x23,0xd9,0x16] -; X86-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x23,0xd1,0x16] ; X86-NEXT: ## zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1] -; X86-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xd3] -; X86-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x23,0xc1,0x16] -; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1] -; X86-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] +; X86-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_f64x2: ; X64: ## %bb.0: -; X64-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x23,0xd9,0x16] -; X64-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x23,0xd1,0x16] ; X64-NEXT: ## zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1] -; X64-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xd3] -; X64-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x23,0xc1,0x16] -; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1] -; X64-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] +; X64-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4) - %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) - %res2 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4) + ret <8 x double> %res +} - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res3, %res2 - ret <8 x double> %res4 +define <8 x double>@test_int_x86_avx512_maskz_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_shuf_f64x2: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x23,0xc1,0x16] +; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_shuf_f64x2: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x23,0xc1,0x16] +; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4) + ret <8 x double> %res } declare <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_i32x4: +; CHECK: ## %bb.0: +; CHECK-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x43,0xc1,0x16] +; CHECK-NEXT: ## zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_i32x4: ; X86: ## %bb.0: -; X86-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x43,0xd9,0x16] -; X86-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vshufi32x4 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x43,0xd1,0x16] ; X86-NEXT: ## zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] -; X86-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_i32x4: ; X64: ## %bb.0: -; X64-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x43,0xd9,0x16] -; X64-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshufi32x4 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x43,0xd1,0x16] ; X64-NEXT: ## zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] -; X64-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_i64x2: +; CHECK: ## %bb.0: +; CHECK-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x43,0xc1,0x16] +; CHECK-NEXT: ## zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_i64x2: ; X86: ## %bb.0: -; X86-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x43,0xd9,0x16] -; X86-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x43,0xd1,0x16] ; X86-NEXT: ## zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1] -; X86-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_i64x2: ; X64: ## %bb.0: -; X64-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x43,0xd9,0x16] -; X64-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x43,0xd1,0x16] ; X64-NEXT: ## zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1] -; X64-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 + ret <8 x i64> %res } declare <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double>, <8 x double>, i32, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_pd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vshufpd $22, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xc6,0xc1,0x16] +; CHECK-NEXT: ## zmm0 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_pd_512: ; X86: ## %bb.0: -; X86-NEXT: vshufpd $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0xc6,0xd9,0x16] -; X86-NEXT: ## zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vshufpd $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xc6,0xd1,0x16] ; X86-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] -; X86-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xd3] -; X86-NEXT: vshufpd $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xc6,0xc1,0x16] -; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] -; X86-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] +; X86-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_pd_512: ; X64: ## %bb.0: -; X64-NEXT: vshufpd $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0xc6,0xd9,0x16] -; X64-NEXT: ## zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshufpd $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xc6,0xd1,0x16] ; X64-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] -; X64-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xd3] -; X64-NEXT: vshufpd $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xc6,0xc1,0x16] -; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] -; X64-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] +; X64-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4) - %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) - %res2 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4) + ret <8 x double> %res +} - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res3, %res2 - ret <8 x double> %res4 +define <8 x double>@test_int_x86_avx512_maskz_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_shuf_pd_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vshufpd $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xc6,0xc1,0x16] +; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_shuf_pd_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vshufpd $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xc6,0xc1,0x16] +; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4) + ret <8 x double> %res } declare <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float>, <16 x float>, i32, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_ps_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vshufps $22, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0xc6,0xc1,0x16] +; CHECK-NEXT: ## zmm0 = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_ps_512: ; X86: ## %bb.0: -; X86-NEXT: vshufps $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0xc6,0xd9,0x16] -; X86-NEXT: ## zmm3 = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vshufps $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0xc6,0xd1,0x16] ; X86-NEXT: ## zmm2 {%k1} = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12] -; X86-NEXT: vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3] +; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_ps_512: ; X64: ## %bb.0: -; X64-NEXT: vshufps $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0xc6,0xd9,0x16] -; X64-NEXT: ## zmm3 = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshufps $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0xc6,0xd1,0x16] ; X64-NEXT: ## zmm2 {%k1} = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12] -; X64-NEXT: vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3] +; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 + ret <16 x float> %res } declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxs_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x3d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxs_d_512: ; X86: ## %bb.0: -; X86-NEXT: vpmaxsd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x3d,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxsd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x3d,0xd1] -; X86-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_d_512: ; X64: ## %bb.0: -; X64-NEXT: vpmaxsd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x3d,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxsd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x3d,0xd1] -; X64-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxs_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x3d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxs_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x3d,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x3d,0xd1] -; X86-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x3d,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x3d,0xd1] -; X64-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 + ret <8 x i64> %res } declare <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxu_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x3f,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxu_d_512: ; X86: ## %bb.0: -; X86-NEXT: vpmaxud %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x3f,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxud %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x3f,0xd1] -; X86-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_d_512: ; X64: ## %bb.0: -; X64-NEXT: vpmaxud %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x3f,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxud %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x3f,0xd1] -; X64-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxu_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x3f,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxu_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x3f,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x3f,0xd1] -; X86-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x3f,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x3f,0xd1] -; X64-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 + ret <8 x i64> %res } declare <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmins_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x39,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmins_d_512: ; X86: ## %bb.0: -; X86-NEXT: vpminsd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x39,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminsd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x39,0xd1] -; X86-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_d_512: ; X64: ## %bb.0: -; X64-NEXT: vpminsd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x39,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminsd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x39,0xd1] -; X64-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmins_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x39,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmins_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x39,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x39,0xd1] -; X86-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x39,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x39,0xd1] -; X64-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 + ret <8 x i64> %res } declare <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pminu_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpminud %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x3b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pminu_d_512: ; X86: ## %bb.0: -; X86-NEXT: vpminud %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x3b,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminud %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x3b,0xd1] -; X86-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_d_512: ; X64: ## %bb.0: -; X64-NEXT: vpminud %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x3b,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminud %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x3b,0xd1] -; X64-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pminu_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x3b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pminu_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x3b,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x3b,0xd1] -; X86-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x3b,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x3b,0xd1] -; X64-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 + ret <8 x i64> %res } define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { @@ -3098,340 +3417,489 @@ declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, < declare <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxb_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovzxbd %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x31,0xc0] +; CHECK-NEXT: ## zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_d_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovzxbd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x31,0xd0] -; X86-NEXT: ## zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovzxbd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x31,0xc8] ; X86-NEXT: ## zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; X86-NEXT: vpmovzxbd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x31,0xc0] -; X86-NEXT: ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; X86-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_d_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovzxbd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x31,0xd0] -; X64-NEXT: ## zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxbd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x31,0xc8] ; X64-NEXT: ## zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_pmovzxb_d_512(<16 x i8> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxb_d_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovzxbd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x31,0xc0] +; X86-NEXT: ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxb_d_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxbd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x31,0xc0] ; X64-NEXT: ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; X64-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res3, %res2 - ret <16 x i32> %res4 + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2) + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxb_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovzxbq %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x32,0xc0] +; CHECK-NEXT: ## zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovzxbq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x32,0xd0] -; X86-NEXT: ## zmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxbq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x32,0xc8] ; X86-NEXT: ## zmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero -; X86-NEXT: vpmovzxbq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x32,0xc0] -; X86-NEXT: ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovzxbq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x32,0xd0] -; X64-NEXT: ## zmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxbq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x32,0xc8] ; X64-NEXT: ## zmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_pmovzxb_q_512(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxb_q_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxbq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x32,0xc0] +; X86-NEXT: ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxb_q_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxbq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x32,0xc0] ; X64-NEXT: ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero -; X64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res3, %res2 - ret <8 x i64> %res4 + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2) + ret <8 x i64> %res } declare <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxd_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovzxdq %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x35,0xc0] +; CHECK-NEXT: ## zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxd_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovzxdq %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x35,0xd0] -; X86-NEXT: ## zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxdq %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x35,0xc8] ; X86-NEXT: ## zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X86-NEXT: vpmovzxdq %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x35,0xc0] -; X86-NEXT: ## zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxd_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovzxdq %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x35,0xd0] -; X64-NEXT: ## zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxdq %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x35,0xc8] ; X64-NEXT: ## zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_pmovzxd_q_512(<8 x i32> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxd_q_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxdq %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x35,0xc0] +; X86-NEXT: ## zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxd_q_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxdq %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x35,0xc0] ; X64-NEXT: ## zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res3, %res2 - ret <8 x i64> %res4 + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2) + ret <8 x i64> %res } declare <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxw_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovzxwd %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x33,0xc0] +; CHECK-NEXT: ## zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxw_d_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovzxwd %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x33,0xd0] -; X86-NEXT: ## zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovzxwd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x33,0xc8] ; X86-NEXT: ## zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; X86-NEXT: vpmovzxwd %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x33,0xc0] -; X86-NEXT: ## zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; X86-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxw_d_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovzxwd %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x33,0xd0] -; X64-NEXT: ## zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x33,0xc8] ; X64-NEXT: ## zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_pmovzxw_d_512(<16 x i16> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxw_d_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovzxwd %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x33,0xc0] +; X86-NEXT: ## zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxw_d_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwd %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x33,0xc0] ; X64-NEXT: ## zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; X64-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res3, %res2 - ret <16 x i32> %res4 + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2) + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxw_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovzxwq %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x34,0xc0] +; CHECK-NEXT: ## zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxw_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovzxwq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x34,0xd0] -; X86-NEXT: ## zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxwq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x34,0xc8] ; X86-NEXT: ## zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X86-NEXT: vpmovzxwq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x34,0xc0] -; X86-NEXT: ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxw_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovzxwq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x34,0xd0] -; X64-NEXT: ## zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x34,0xc8] ; X64-NEXT: ## zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_pmovzxw_q_512(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxw_q_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxwq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x34,0xc0] +; X86-NEXT: ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxw_q_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x34,0xc0] ; X64-NEXT: ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res3, %res2 - ret <8 x i64> %res4 + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2) + ret <8 x i64> %res } declare <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxb_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x21,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_d_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovsxbd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x21,0xd0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovsxbd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x21,0xc8] -; X86-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x21,0xc0] -; X86-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_d_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovsxbd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x21,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxbd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x21,0xc8] -; X64-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x21,0xc0] -; X64-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res3, %res2 - ret <16 x i32> %res4 + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_pmovsxb_d_512(<16 x i8> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxb_d_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x21,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxb_d_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x21,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2) + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxb_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovsxbq %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x22,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovsxbq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x22,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsxbq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x22,0xc8] -; X86-NEXT: vpmovsxbq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x22,0xc0] -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovsxbq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x22,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxbq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x22,0xc8] -; X64-NEXT: vpmovsxbq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x22,0xc0] -; X64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res3, %res2 - ret <8 x i64> %res4 + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_pmovsxb_q_512(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxb_q_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxbq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x22,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxb_q_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxbq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x22,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2) + ret <8 x i64> %res } declare <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxd_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x25,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxd_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovsxdq %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x25,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsxdq %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x25,0xc8] -; X86-NEXT: vpmovsxdq %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x25,0xc0] -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxd_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovsxdq %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x25,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxdq %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x25,0xc8] -; X64-NEXT: vpmovsxdq %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x25,0xc0] -; X64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res3, %res2 - ret <8 x i64> %res4 + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_pmovsxd_q_512(<8 x i32> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxd_q_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxdq %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x25,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxd_q_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxdq %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x25,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2) + ret <8 x i64> %res } - declare <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxw_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x23,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxw_d_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovsxwd %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x23,0xd0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovsxwd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x23,0xc8] -; X86-NEXT: vpmovsxwd %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x23,0xc0] -; X86-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxw_d_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovsxwd %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x23,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxwd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x23,0xc8] -; X64-NEXT: vpmovsxwd %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x23,0xc0] -; X64-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res3, %res2 - ret <16 x i32> %res4 + ret <16 x i32> %res } +define <16 x i32>@test_int_x86_avx512_maskz_pmovsxw_d_512(<16 x i16> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxw_d_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovsxwd %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x23,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxw_d_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxwd %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x23,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2) + ret <16 x i32> %res +} declare <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxw_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovsxwq %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x24,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxw_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovsxwq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x24,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsxwq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x24,0xc8] -; X86-NEXT: vpmovsxwq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x24,0xc0] -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxw_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovsxwq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x24,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxwq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x24,0xc8] -; X64-NEXT: vpmovsxwq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x24,0xc0] -; X64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res3, %res2 - ret <8 x i64> %res4 + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_pmovsxw_q_512(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxw_q_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxwq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x24,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxw_q_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxwq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x24,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2) + ret <8 x i64> %res } declare <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32>, <16 x i32>) @@ -4464,52 +4932,62 @@ define <8 x i64> @test_x86_avx512_psrlv_q_memop(<8 x i64> %a0, <8 x i64>* %ptr) declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_dq2pd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 ## encoding: [0x62,0xf1,0x7e,0x48,0xe6,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_512: ; X86: ## %bb.0: -; X86-NEXT: vcvtdq2pd %ymm0, %zmm2 ## encoding: [0x62,0xf1,0x7e,0x48,0xe6,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtdq2pd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0xe6,0xc8] -; X86-NEXT: vaddpd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc2] +; X86-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_512: ; X64: ## %bb.0: -; X64-NEXT: vcvtdq2pd %ymm0, %zmm2 ## encoding: [0x62,0xf1,0x7e,0x48,0xe6,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtdq2pd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0xe6,0xc8] -; X64-NEXT: vaddpd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc2] +; X64-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) - %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1) - %res2 = fadd <8 x double> %res, %res1 - ret <8 x double> %res2 + ret <8 x double> %res } declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_udq2pd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm0 ## encoding: [0x62,0xf1,0x7e,0x48,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_512: ; X86: ## %bb.0: -; X86-NEXT: vcvtudq2pd %ymm0, %zmm2 ## encoding: [0x62,0xf1,0x7e,0x48,0x7a,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtudq2pd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x7a,0xc8] -; X86-NEXT: vaddpd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc2] +; X86-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_512: ; X64: ## %bb.0: -; X64-NEXT: vcvtudq2pd %ymm0, %zmm2 ## encoding: [0x62,0xf1,0x7e,0x48,0x7a,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtudq2pd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x7a,0xc8] -; X64-NEXT: vaddpd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc2] +; X64-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) - %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1) - %res2 = fadd <8 x double> %res, %res1 - ret <8 x double> %res2 + ret <8 x double> %res } define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) { @@ -4637,63 +5115,95 @@ declare <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32>, <16 x i32>, i3 declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_pd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x0d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_512: ; X86: ## %bb.0: -; X86-NEXT: vpermilpd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x0d,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermilpd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x0d,0xd1] -; X86-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x0d,0xc1] -; X86-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] -; X86-NEXT: vaddpd %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xe5,0x48,0x58,0xc0] +; X86-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_512: ; X64: ## %bb.0: -; X64-NEXT: vpermilpd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x0d,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilpd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x0d,0xd1] -; X64-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x0d,0xc1] -; X64-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] -; X64-NEXT: vaddpd %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xe5,0x48,0x58,0xc0] +; X64-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) - %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3) - %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res2, %res3 - ret <8 x double> %res4 + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermilvar_pd_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x0d,0xc1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermilvar_pd_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x0d,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3) + ret <8 x double> %res } declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x0c,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_512: ; X86: ## %bb.0: -; X86-NEXT: vpermilps %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x0c,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermilps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x0c,0xd1] -; X86-NEXT: vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0xc1] -; X86-NEXT: vaddps %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0] -; X86-NEXT: vaddps %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0x64,0x48,0x58,0xc0] +; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_512: ; X64: ## %bb.0: -; X64-NEXT: vpermilps %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x0c,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x0c,0xd1] -; X64-NEXT: vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0xc1] -; X64-NEXT: vaddps %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0] -; X64-NEXT: vaddps %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0x64,0x48,0x58,0xc0] +; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) - %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3) - %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res2, %res3 - ret <16 x float> %res4 + ret <16 x float> %res +} + + +define <16 x float>@test_int_x86_avx512_maskz_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermilvar_ps_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0xc1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermilvar_ps_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3) + ret <16 x float> %res } ; Test case to make sure we can print shuffle decode comments for constant pool loads. @@ -4703,14 +5213,14 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15] ; X86-NEXT: ## encoding: [0x62,0xf2,0x7d,0x49,0x0c,0x15,A,A,A,A] -; X86-NEXT: ## fixup A - offset: 6, value: LCPI216_0, kind: FK_Data_4 +; X86-NEXT: ## fixup A - offset: 6, value: LCPI291_0, kind: FK_Data_4 ; X86-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] ; X86-NEXT: ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0x0d,A,A,A,A] -; X86-NEXT: ## fixup A - offset: 6, value: LCPI216_1, kind: FK_Data_4 +; X86-NEXT: ## fixup A - offset: 6, value: LCPI291_1, kind: FK_Data_4 ; X86-NEXT: vaddps %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9] ; X86-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; X86-NEXT: ## encoding: [0x62,0xf2,0x7d,0x48,0x0c,0x05,A,A,A,A] -; X86-NEXT: ## fixup A - offset: 6, value: LCPI216_2, kind: FK_Data_4 +; X86-NEXT: ## fixup A - offset: 6, value: LCPI291_2, kind: FK_Data_4 ; X86-NEXT: vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -4719,14 +5229,14 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15] ; X64-NEXT: ## encoding: [0x62,0xf2,0x7d,0x49,0x0c,0x15,A,A,A,A] -; X64-NEXT: ## fixup A - offset: 6, value: LCPI216_0-4, kind: reloc_riprel_4byte +; X64-NEXT: ## fixup A - offset: 6, value: LCPI291_0-4, kind: reloc_riprel_4byte ; X64-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] ; X64-NEXT: ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0x0d,A,A,A,A] -; X64-NEXT: ## fixup A - offset: 6, value: LCPI216_1-4, kind: reloc_riprel_4byte +; X64-NEXT: ## fixup A - offset: 6, value: LCPI291_1-4, kind: reloc_riprel_4byte ; X64-NEXT: vaddps %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9] ; X64-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; X64-NEXT: ## encoding: [0x62,0xf2,0x7d,0x48,0x0c,0x05,A,A,A,A] -; X64-NEXT: ## fixup A - offset: 6, value: LCPI216_2-4, kind: reloc_riprel_4byte +; X64-NEXT: ## fixup A - offset: 6, value: LCPI291_2-4, kind: reloc_riprel_4byte ; X64-NEXT: vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x2, i16 %x3) @@ -5153,124 +5663,186 @@ declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_insertf32x4_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x18,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_insertf32x4_512: ; X86: ## %bb.0: -; X86-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0x7d,0x48,0x18,0xd9,0x01] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x18,0xd1,0x01] -; X86-NEXT: vaddps %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xd3] -; X86-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x18,0xc1,0x01] -; X86-NEXT: vaddps %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc2] +; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_insertf32x4_512: ; X64: ## %bb.0: -; X64-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0x7d,0x48,0x18,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x18,0xd1,0x01] -; X64-NEXT: vaddps %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xd3] -; X64-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x18,0xc1,0x01] -; X64-NEXT: vaddps %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc2] +; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) - %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res2, %res3 - ret <16 x float> %res4 + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_maskz_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, i16 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_insertf32x4_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x18,0xc1,0x01] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_insertf32x4_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x18,0xc1,0x01] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) + ret <16 x float> %res } declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_inserti32x4_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x18,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_inserti32x4_512: ; X86: ## %bb.0: -; X86-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0x7d,0x48,0x38,0xd9,0x01] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x38,0xd1,0x01] -; X86-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x38,0xc1,0x01] -; X86-NEXT: vpaddd %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0] -; X86-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_inserti32x4_512: ; X64: ## %bb.0: -; X64-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0x7d,0x48,0x38,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x38,0xd1,0x01] -; X64-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x38,0xc1,0x01] -; X64-NEXT: vpaddd %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0] -; X64-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res2, %res3 - ret <16 x i32> %res4 + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, i16 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_inserti32x4_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x38,0xc1,0x01] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_inserti32x4_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x38,0xc1,0x01] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) + ret <16 x i32> %res } declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_insertf64x4_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_insertf64x4_512: ; X86: ## %bb.0: -; X86-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd9,0x01] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xd1,0x01] -; X86-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xd3] -; X86-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc1,0x01] -; X86-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc2] +; X86-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_insertf64x4_512: ; X64: ## %bb.0: -; X64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xd1,0x01] -; X64-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xd3] -; X64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc1,0x01] -; X64-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc2] +; X64-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) - %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) - %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res2, %res3 - ret <8 x double> %res4 + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_insertf64x4_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc1,0x01] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_insertf64x4_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc1,0x01] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) + ret <8 x double> %res } declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_inserti64x4_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_inserti64x4_512: ; X86: ## %bb.0: -; X86-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd9,0x01] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x3a,0xd1,0x01] -; X86-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc1,0x01] -; X86-NEXT: vpaddq %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xe5,0x48,0xd4,0xc0] -; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_inserti64x4_512: ; X64: ## %bb.0: -; X64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x3a,0xd1,0x01] -; X64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc1,0x01] -; X64-NEXT: vpaddq %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xe5,0x48,0xd4,0xc0] -; X64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res2, %res3 - ret <8 x i64> %res4 + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_inserti64x4_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc1,0x01] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_inserti64x4_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc1,0x01] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) + ret <8 x i64> %res } define <8 x i64> @test_x86_avx512_movntdqa(i8* %a0) { @@ -5817,38 +6389,59 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512_load(<4 x float> ret <16 x float> %res } -declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) { -; X86-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512: +declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcastf64x4_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512: +; X86: ## %bb.0: +; X86-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xc8,0x01] +; X86-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512: +; X64: ## %bb.0: +; X64-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xc8,0x01] +; X64-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_broadcastf64x4_512(<4 x double> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcastf64x4_512: ; X86: ## %bb.0: ; X86-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd0,0x01] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xc8,0x01] -; X86-NEXT: vaddpd %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc9] ; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc0,0x01] -; X86-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512: +; X64-LABEL: test_int_x86_avx512_maskz_broadcastf64x4_512: ; X64: ## %bb.0: ; X64-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xc8,0x01] -; X64-NEXT: vaddpd %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc9] ; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc0,0x01] -; X64-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] - %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1) - %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask) - %res3 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> zeroinitializer, i8 %mask) - %res4 = fadd <8 x double> %res1, %res2 - %res5 = fadd <8 x double> %res3, %res4 - ret <8 x double> %res5 + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res } define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512_load(<4 x double>* %x0ptr, <8 x double> %x2, i8 %mask) { @@ -5931,36 +6524,57 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512_load(<4 x i32>* %x declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcasti64x4_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512: ; X86: ## %bb.0: ; X86-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd0,0x01] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x3a,0xc8,0x01] -; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc0,0x01] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] -; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512: ; X64: ## %bb.0: ; X64-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x3a,0xc8,0x01] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_broadcasti64x4_512(<4 x i64> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcasti64x4_512: +; X86: ## %bb.0: +; X86-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc0,0x01] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_broadcasti64x4_512: +; X64: ## %bb.0: +; X64-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc0,0x01] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] -; X64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) - %res3 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask) - %res4 = add <8 x i64> %res1, %res2 - %res5 = add <8 x i64> %res3, %res4 - ret <8 x i64> %res5 + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res } define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512_load(<4 x i64>* %x0ptr, <8 x i64> %x2, i8 %mask) { @@ -6768,150 +7382,213 @@ declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_df_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_df_512: ; X86: ## %bb.0: -; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x16,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x16,0xd0] -; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x16,0xc0] -; X86-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] -; X86-NEXT: vaddpd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc3] +; X86-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_df_512: ; X64: ## %bb.0: -; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x16,0xd8] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x16,0xd0] -; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x16,0xc0] -; X64-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] -; X64-NEXT: vaddpd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc3] +; X64-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) - %res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3) - %res2 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res3, %res2 - ret <8 x double> %res4 + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_df_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x16,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_df_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x16,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3) + ret <8 x double> %res } declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_di_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_di_512: ; X86: ## %bb.0: -; X86-NEXT: vpermq %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x36,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x36,0xd0] -; X86-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x36,0xc0] -; X86-NEXT: vpaddq %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc3] -; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_di_512: ; X64: ## %bb.0: -; X64-NEXT: vpermq %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x36,0xd8] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x36,0xd0] -; X64-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x36,0xc0] -; X64-NEXT: vpaddq %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc3] -; X64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res3, %res2 - ret <8 x i64> %res4 + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_di_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x36,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_di_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x36,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) + ret <8 x i64> %res } declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_sf_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_sf_512: ; X86: ## %bb.0: -; X86-NEXT: vpermps %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0x75,0x48,0x16,0xd8] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0x16,0xd0] -; X86-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x16,0xc0] -; X86-NEXT: vaddps %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0] -; X86-NEXT: vaddps %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc3] +; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_sf_512: ; X64: ## %bb.0: -; X64-NEXT: vpermps %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0x75,0x48,0x16,0xd8] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0x16,0xd0] -; X64-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x16,0xc0] -; X64-NEXT: vaddps %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0] -; X64-NEXT: vaddps %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc3] +; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) - %res1 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3) - %res2 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res3, %res2 - ret <16 x float> %res4 + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_sf_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x16,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_sf_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x16,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3) + ret <16 x float> %res } declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_si_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_si_512: ; X86: ## %bb.0: -; X86-NEXT: vpermd %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0x75,0x48,0x36,0xd8] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0x36,0xd0] -; X86-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x36,0xc0] -; X86-NEXT: vpaddd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3] -; X86-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_si_512: ; X64: ## %bb.0: -; X64-NEXT: vpermd %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0x75,0x48,0x36,0xd8] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0x36,0xd0] -; X64-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x36,0xc0] -; X64-NEXT: vpaddd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3] -; X64-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res3, %res2 - ret <16 x i32> %res4 + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_si_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x36,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_si_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x36,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) + ret <16 x i32> %res } declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16) +define <16 x i32>@test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pternlog_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf3,0x75,0x48,0x25,0xc2,0x21] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_pternlog_d_512: ; X86: ## %bb.0: -; X86-NEXT: vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0x75,0x48,0x25,0xda,0x21] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x49,0x25,0xc2,0x21] -; X86-NEXT: vpaddd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_d_512: ; X64: ## %bb.0: -; X64-NEXT: vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0x75,0x48,0x25,0xda,0x21] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x49,0x25,0xc2,0x21] -; X64-NEXT: vpaddd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16) @@ -6919,52 +7596,45 @@ declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>, define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_maskz_pternlog_d_512: ; X86: ## %bb.0: -; X86-NEXT: vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0x75,0x48,0x25,0xda,0x21] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc2,0x21] -; X86-NEXT: vpaddd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_d_512: ; X64: ## %bb.0: -; X64-NEXT: vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0x75,0x48,0x25,0xda,0x21] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc2,0x21] -; X64-NEXT: vpaddd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4) - %res1 = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8) +define <8 x i64>@test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pternlog_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf3,0xf5,0x48,0x25,0xc2,0x21] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_pternlog_q_512: ; X86: ## %bb.0: -; X86-NEXT: vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0xf5,0x48,0x25,0xda,0x21] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x49,0x25,0xc2,0x21] -; X86-NEXT: vpaddq %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc3] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_q_512: ; X64: ## %bb.0: -; X64-NEXT: vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0xf5,0x48,0x25,0xda,0x21] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x49,0x25,0xc2,0x21] -; X64-NEXT: vpaddq %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc3] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 + ret <8 x i64> %res } declare <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8) @@ -6996,110 +7666,123 @@ define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i6 declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %x2 = load <16 x i32>, <16 x i32>* %x2p + %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X86-NEXT: vpermi2d (%eax), %zmm0, %zmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x76,0x18] -; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x7e,0xc2] -; X86-NEXT: vpaddd %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0] +; X86-NEXT: vpermi2d (%eax), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x76,0x08] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512: ; X64: ## %bb.0: ; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X64-NEXT: vpermi2d (%rdi), %zmm0, %zmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x76,0x1f] -; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x7e,0xc2] -; X64-NEXT: vpaddd %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0] +; X64-NEXT: vpermi2d (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x76,0x0f] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %x2 = load <16 x i32>, <16 x i32>* %x2p %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_pd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512: ; X86: ## %bb.0: -; X86-NEXT: vmovapd %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xd8] -; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x7f,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x77,0xca] -; X86-NEXT: vaddpd %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc3] +; X86-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512: ; X64: ## %bb.0: -; X64-NEXT: vmovapd %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xd8] -; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x77,0xca] -; X64-NEXT: vaddpd %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc3] +; X64-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) - %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) - %res2 = fadd <8 x double> %res, %res1 - ret <8 x double> %res2 + ret <8 x double> %res } declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_ps_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512: ; X86: ## %bb.0: -; X86-NEXT: vmovaps %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd8] -; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0x75,0x48,0x7f,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x77,0xca] -; X86-NEXT: vaddps %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc3] +; X86-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512: ; X64: ## %bb.0: -; X64-NEXT: vmovaps %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd8] -; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0x75,0x48,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x77,0xca] -; X64-NEXT: vaddps %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc3] +; X64-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) - %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 + ret <16 x float> %res } declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512: ; X86: ## %bb.0: -; X86-NEXT: vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x76,0xca] -; X86-NEXT: vpaddq %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc3] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512: ; X64: ## %bb.0: -; X64-NEXT: vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x76,0xca] -; X64-NEXT: vpaddq %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc3] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 + ret <8 x i64> %res } declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) @@ -7109,25 +7792,17 @@ define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vmovdqa64 %zmm1, %zmm2 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd1] -; X86-NEXT: vpermt2d (%eax), %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7e,0x10] -; X86-NEXT: vpermt2d %zmm1, %zmm0, %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x7e,0xc9] -; X86-NEXT: vpaddd %zmm1, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc1] +; X86-NEXT: vpermi2d (%eax), %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x76,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512: ; X64: ## %bb.0: ; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vmovdqa64 %zmm1, %zmm2 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd1] -; X64-NEXT: vpermt2d (%rdi), %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7e,0x17] -; X64-NEXT: vpermt2d %zmm1, %zmm0, %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x7e,0xc9] -; X64-NEXT: vpaddd %zmm1, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc1] +; X64-NEXT: vpermi2d (%rdi), %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x76,0x07] ; X64-NEXT: retq ## encoding: [0xc3] %x2 = load <16 x i32>, <16 x i32>* %x2p %res = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x1, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8) @@ -7138,27 +7813,19 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, < ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] ; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] -; X86-NEXT: vmovapd %zmm1, %zmm2 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xd1] -; X86-NEXT: vpermt2pd (%eax){1to8}, %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x7f,0x10] -; X86-NEXT: vpermt2pd %zmm1, %zmm0, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x7f,0xc9] -; X86-NEXT: vaddpd %zmm1, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc1] +; X86-NEXT: vpermi2pd (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xd9,0x77,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512: ; X64: ## %bb.0: ; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vmovapd %zmm1, %zmm2 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xd1] -; X64-NEXT: vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x7f,0x17] -; X64-NEXT: vpermt2pd %zmm1, %zmm0, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x7f,0xc9] -; X64-NEXT: vaddpd %zmm1, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc1] +; X64-NEXT: vpermi2pd (%rdi){1to8}, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xd9,0x77,0x07] ; X64-NEXT: retq ## encoding: [0xc3] %x2s = load double, double* %x2ptr %x2ins = insertelement <8 x double> undef, double %x2s, i32 0 %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer %res = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) - %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x1, i8 -1) - %res2 = fadd <8 x double> %res, %res1 - ret <8 x double> %res2 + ret <8 x double> %res } declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16) @@ -7166,25 +7833,17 @@ declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512: ; X86: ## %bb.0: -; X86-NEXT: vmovaps %zmm1, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd9] -; X86-NEXT: vpermt2ps %zmm2, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x7f,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7f,0xca] -; X86-NEXT: vaddps %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc3] +; X86-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x77,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512: ; X64: ## %bb.0: -; X64-NEXT: vmovaps %zmm1, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd9] -; X64-NEXT: vpermt2ps %zmm2, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7f,0xca] -; X64-NEXT: vaddps %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc3] +; X64-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x77,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) - %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 + ret <16 x float> %res } @@ -7193,52 +7852,47 @@ declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512: ; X86: ## %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X86-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x7e,0xca] -; X86-NEXT: vpaddq %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc3] +; X86-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x76,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512: ; X64: ## %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X64-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x7e,0xca] -; X64-NEXT: vpaddq %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc3] +; X64-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x76,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 + ret <8 x i64> %res } declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x76,0xc2] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512: ; X86: ## %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X86-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x7e,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x7e,0xca] -; X86-NEXT: vpaddd %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc3] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512: ; X64: ## %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X64-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x7e,0xca] -; X64-NEXT: vpaddd %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc3] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) @@ -9740,33 +10394,49 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmov_qd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovqd %zmm0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x48,0x35,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovqd %zmm0, %ymm2 ## encoding: [0x62,0xf2,0x7e,0x48,0x35,0xc2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovqd %zmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x49,0x35,0xc1] -; X86-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xc9,0x35,0xc0] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0xc5,0xf5,0xfe,0xc0] -; X86-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0xc5,0xed,0xfe,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 ## encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovqd %zmm0, %ymm2 ## encoding: [0x62,0xf2,0x7e,0x48,0x35,0xc2] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqd %zmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x49,0x35,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 ## encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_pmov_qd_512(<8 x i64> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmov_qd_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xc9,0x35,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmov_qd_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xc9,0x35,0xc0] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0xc5,0xf5,0xfe,0xc0] -; X64-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0xc5,0xed,0xfe,0xc0] -; X64-NEXT: retq ## encoding: [0xc3] - %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) - %res3 = add <8 x i32> %res0, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res } declare <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32>, <16 x float>, i16, i32) diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index 8844c0e5e95f17..07278bde99c22b 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -2922,160 +2922,166 @@ declare <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float>, i32, i32) nounwind r declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>) -define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) { +define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p) { +; X64-LABEL: test_int_x86_avx512_vpermi2var_d_512: +; X64: # %bb.0: +; X64-NEXT: vpermt2d (%rdi), %zmm1, %zmm0 +; X64-NEXT: retq +; +; X86-LABEL: test_int_x86_avx512_vpermi2var_d_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpermt2d (%eax), %zmm1, %zmm0 +; X86-NEXT: retl + %x2 = load <16 x i32>, <16 x i32>* %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %1 +} + +define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, i16 %x3) { ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 -; X64-NEXT: vpermi2d (%rdi), %zmm0, %zmm3 {%k1} -; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 -; X64-NEXT: vpaddd %zmm0, %zmm3, %zmm0 +; X64-NEXT: vpermi2d (%rdi), %zmm0, %zmm1 {%k1} +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 -; X86-NEXT: vpermi2d (%eax), %zmm0, %zmm3 {%k1} -; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 -; X86-NEXT: vpaddd %zmm0, %zmm3, %zmm0 +; X86-NEXT: vpermi2d (%eax), %zmm0, %zmm1 {%k1} +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-NEXT: retl %x2 = load <16 x i32>, <16 x i32>* %x2p %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1 - %4 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4) - %res2 = add <16 x i32> %3, %4 - ret <16 x i32> %res2 + ret <16 x i32> %3 } declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>) +define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_pd_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) + ret <8 x double> %1 +} + define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) { ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512: ; X64: # %bb.0: -; X64-NEXT: vmovapd %zmm0, %zmm3 -; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm3 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1} -; X64-NEXT: vaddpd %zmm3, %zmm1, %zmm0 +; X64-NEXT: vmovapd %zmm1, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512: ; X86: # %bb.0: -; X86-NEXT: vmovapd %zmm0, %zmm3 -; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm3 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1} -; X86-NEXT: vaddpd %zmm3, %zmm1, %zmm0 +; X86-NEXT: vmovapd %zmm1, %zmm0 ; X86-NEXT: retl %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) %2 = bitcast <8 x i64> %x1 to <8 x double> %3 = bitcast i8 %x3 to <8 x i1> %4 = select <8 x i1> %3, <8 x double> %1, <8 x double> %2 - %5 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) - %6 = bitcast <8 x i64> %x1 to <8 x double> - %res2 = fadd <8 x double> %4, %5 - ret <8 x double> %res2 + ret <8 x double> %4 } declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>) +define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_ps_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) + ret <16 x float> %1 +} + define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) { ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512: ; X64: # %bb.0: -; X64-NEXT: vmovaps %zmm0, %zmm3 -; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm3 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1} -; X64-NEXT: vaddps %zmm3, %zmm1, %zmm0 +; X64-NEXT: vmovaps %zmm1, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512: ; X86: # %bb.0: -; X86-NEXT: vmovaps %zmm0, %zmm3 -; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm3 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1} -; X86-NEXT: vaddps %zmm3, %zmm1, %zmm0 +; X86-NEXT: vmovaps %zmm1, %zmm0 ; X86-NEXT: retl %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) %2 = bitcast <16 x i32> %x1 to <16 x float> %3 = bitcast i16 %x3 to <16 x i1> %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2 - %5 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) - %6 = bitcast <16 x i32> %x1 to <16 x float> - %res2 = fadd <16 x float> %4, %5 - ret <16 x float> %res2 + ret <16 x float> %4 } declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>) +define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_q_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + ret <8 x i64> %1 +} + define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm0, %zmm3 -; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1} -; X64-NEXT: vpaddq %zmm3, %zmm1, %zmm0 +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm0, %zmm3 -; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1} -; X86-NEXT: vpaddq %zmm3, %zmm1, %zmm0 +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-NEXT: retl %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1 - %4 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) - %res2 = add <8 x i64> %3, %4 - ret <8 x i64> %res2 + ret <8 x i64> %3 } define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, i16 %x3) { ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 -; X64-NEXT: vmovdqa64 %zmm1, %zmm2 -; X64-NEXT: vpermt2d (%rdi), %zmm0, %zmm2 {%k1} {z} -; X64-NEXT: vpermt2d %zmm1, %zmm0, %zmm1 -; X64-NEXT: vpaddd %zmm1, %zmm2, %zmm0 +; X64-NEXT: vpermi2d (%rdi), %zmm1, %zmm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vmovdqa64 %zmm1, %zmm2 -; X86-NEXT: vpermt2d (%eax), %zmm0, %zmm2 {%k1} {z} -; X86-NEXT: vpermt2d %zmm1, %zmm0, %zmm1 -; X86-NEXT: vpaddd %zmm1, %zmm2, %zmm0 +; X86-NEXT: vpermi2d (%eax), %zmm1, %zmm0 {%k1} {z} ; X86-NEXT: retl %x2 = load <16 x i32>, <16 x i32>* %x2p %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer - %4 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x1) - %res2 = add <16 x i32> %3, %4 - ret <16 x i32> %res2 + ret <16 x i32> %3 } define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) { ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 -; X64-NEXT: vmovapd %zmm1, %zmm2 -; X64-NEXT: vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z} -; X64-NEXT: vpermt2pd %zmm1, %zmm0, %zmm1 -; X64-NEXT: vaddpd %zmm1, %zmm2, %zmm0 +; X64-NEXT: vpermi2pd (%rdi){1to8}, %zmm1, %zmm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512: @@ -3083,10 +3089,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, < ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: kmovw %ecx, %k1 -; X86-NEXT: vmovapd %zmm1, %zmm2 -; X86-NEXT: vpermt2pd (%eax){1to8}, %zmm0, %zmm2 {%k1} {z} -; X86-NEXT: vpermt2pd %zmm1, %zmm0, %zmm1 -; X86-NEXT: vaddpd %zmm1, %zmm2, %zmm0 +; X86-NEXT: vpermi2pd (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} ; X86-NEXT: retl %x2s = load double, double* %x2ptr %x2ins = insertelement <8 x double> undef, double %x2s, i32 0 @@ -3094,88 +3097,73 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, < %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer - %4 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x1) - %res2 = fadd <8 x double> %3, %4 - ret <8 x double> %res2 + ret <8 x double> %3 } define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) { ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512: ; X64: # %bb.0: -; X64-NEXT: vmovaps %zmm1, %zmm3 -; X64-NEXT: vpermt2ps %zmm2, %zmm0, %zmm3 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z} -; X64-NEXT: vaddps %zmm3, %zmm1, %zmm0 +; X64-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512: ; X86: # %bb.0: -; X86-NEXT: vmovaps %zmm1, %zmm3 -; X86-NEXT: vpermt2ps %zmm2, %zmm0, %zmm3 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z} -; X86-NEXT: vaddps %zmm3, %zmm1, %zmm0 +; X86-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 {%k1} {z} ; X86-NEXT: retl %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - %4 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2) - %res2 = fadd <16 x float> %3, %4 - ret <16 x float> %res2 + ret <16 x float> %3 } define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 -; X64-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z} -; X64-NEXT: vpaddq %zmm3, %zmm1, %zmm0 +; X64-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 -; X86-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z} -; X86-NEXT: vpaddq %zmm3, %zmm1, %zmm0 +; X86-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 {%k1} {z} ; X86-NEXT: retl %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer - %4 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2) - %res2 = add <8 x i64> %3, %4 - ret <8 x i64> %res2 + ret <8 x i64> %3 +} + +define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_d_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) + ret <16 x i32> %1 } define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 -; X64-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 {%k1} -; X64-NEXT: vpaddd %zmm3, %zmm1, %zmm0 +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 -; X86-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 {%k1} -; X86-NEXT: vpaddd %zmm3, %zmm1, %zmm0 +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-NEXT: retl %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1 - %4 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) - %res2 = add <16 x i32> %3, %4 - ret <16 x i32> %res2 + ret <16 x i32> %3 } declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) @@ -3573,37 +3561,53 @@ define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(i8* %ptr, <8 x i64> %x1, ret void } +define <8 x i32>@test_int_x86_avx512_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmov_qd_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovqd %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = trunc <8 x i64> %x0 to <8 x i32> + ret <8 x i32> %1 +} + define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { ; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_512: ; X64: # %bb.0: -; X64-NEXT: vpmovqd %zmm0, %ymm2 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpmovqd %zmm0, %ymm1 {%k1} -; X64-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z} -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; X64-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; X64-NEXT: vmovdqa %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_512: ; X86: # %bb.0: -; X86-NEXT: vpmovqd %zmm0, %ymm2 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpmovqd %zmm0, %ymm1 {%k1} +; X86-NEXT: vmovdqa %ymm1, %ymm0 +; X86-NEXT: retl + %1 = trunc <8 x i64> %x0 to <8 x i32> + %2 = bitcast i8 %x2 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1 + ret <8 x i32> %3 +} + +define <8 x i32>@test_int_x86_avx512_maskz_pmov_qd_512(<8 x i64> %x0, i8 %x2) { +; X64-LABEL: test_int_x86_avx512_maskz_pmov_qd_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: test_int_x86_avx512_maskz_pmov_qd_512: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z} -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; X86-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; X86-NEXT: retl %1 = trunc <8 x i64> %x0 to <8 x i32> - %2 = trunc <8 x i64> %x0 to <8 x i32> - %3 = bitcast i8 %x2 to <8 x i1> - %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %x1 - %5 = trunc <8 x i64> %x0 to <8 x i32> - %6 = bitcast i8 %x2 to <8 x i1> - %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> zeroinitializer - %res3 = add <8 x i32> %1, %4 - %res4 = add <8 x i32> %res3, %7 - ret <8 x i32> %res4 + %2 = bitcast i8 %x2 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer + ret <8 x i32> %3 } declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64>, i8) @@ -3633,15 +3637,21 @@ define void @test_int_x86_avx512_mask_pmov_qd_mem_512(i8* %ptr, <8 x i64> %x1, i declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovs_qd_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsqd %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qd_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpmovsqd %zmm0, %ymm2 {%k1} {z} ; X64-NEXT: vpmovsqd %zmm0, %ymm1 {%k1} -; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; X64-NEXT: vpmovsqd %zmm0, %ymm0 -; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; X64-NEXT: vmovdqa %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qd_512: @@ -3649,17 +3659,27 @@ define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpmovsqd %zmm0, %ymm1 {%k1} -; X86-NEXT: vpmovsqd %zmm0, %ymm2 {%k1} {z} -; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; X86-NEXT: vpmovsqd %zmm0, %ymm0 -; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; X86-NEXT: vmovdqa %ymm1, %ymm0 ; X86-NEXT: retl - %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) - %res3 = add <8 x i32> %res0, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_pmovs_qd_512(<8 x i64> %x0, i8 %x2) { +; X64-LABEL: test_int_x86_avx512_maskz_pmovs_qd_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmovsqd %zmm0, %ymm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: test_int_x86_avx512_maskz_pmovs_qd_512: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vpmovsqd %zmm0, %ymm0 {%k1} {z} +; X86-NEXT: retl + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res } declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64>, i8) @@ -3689,15 +3709,21 @@ define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(i8* %ptr, <8 x i64> %x1, declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovus_qd_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovusqd %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qd_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpmovusqd %zmm0, %ymm2 {%k1} {z} ; X64-NEXT: vpmovusqd %zmm0, %ymm1 {%k1} -; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; X64-NEXT: vpmovusqd %zmm0, %ymm0 -; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; X64-NEXT: vmovdqa %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qd_512: @@ -3705,17 +3731,27 @@ define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpmovusqd %zmm0, %ymm1 {%k1} -; X86-NEXT: vpmovusqd %zmm0, %ymm2 {%k1} {z} -; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; X86-NEXT: vpmovusqd %zmm0, %ymm0 -; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; X86-NEXT: vmovdqa %ymm1, %ymm0 +; X86-NEXT: retl + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_pmovus_qd_512(<8 x i64> %x0, i8 %x2) { +; X64-LABEL: test_int_x86_avx512_maskz_pmovus_qd_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmovusqd %zmm0, %ymm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: test_int_x86_avx512_maskz_pmovus_qd_512: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vpmovusqd %zmm0, %ymm0 {%k1} {z} ; X86-NEXT: retl - %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) - %res3 = add <8 x i32> %res0, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res } declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64>, i8) @@ -5148,140 +5184,202 @@ declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32) declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) +define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_permvar_df_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1) + ret <8 x double> %1 +} + define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) { ; X64-LABEL: test_int_x86_avx512_mask_permvar_df_512: ; X64: # %bb.0: -; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm3 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1} -; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} -; X64-NEXT: vaddpd %zmm0, %zmm2, %zmm0 -; X64-NEXT: vaddpd %zmm3, %zmm0, %zmm0 +; X64-NEXT: vmovapd %zmm2, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_permvar_df_512: ; X86: # %bb.0: -; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm3 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1} -; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} -; X86-NEXT: vaddpd %zmm0, %zmm2, %zmm0 -; X86-NEXT: vaddpd %zmm3, %zmm0, %zmm0 +; X86-NEXT: vmovapd %zmm2, %zmm0 ; X86-NEXT: retl %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x2 - %4 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = select <8 x i1> %5, <8 x double> %4, <8 x double> zeroinitializer - %7 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1) - %res3 = fadd <8 x double> %3, %6 - %res4 = fadd <8 x double> %res3, %7 - ret <8 x double> %res4 + ret <8 x double> %3 +} + +define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, i8 %x3) { +; X64-LABEL: test_int_x86_avx512_maskz_permvar_df_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: test_int_x86_avx512_maskz_permvar_df_512: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} +; X86-NEXT: retl + %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 } declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) +define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_permvar_di_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1) + ret <8 x i64> %1 +} + define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X64-LABEL: test_int_x86_avx512_mask_permvar_di_512: ; X64: # %bb.0: -; X64-NEXT: vpermq %zmm0, %zmm1, %zmm3 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1} -; X64-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} -; X64-NEXT: vpaddq %zmm3, %zmm0, %zmm0 -; X64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_permvar_di_512: ; X86: # %bb.0: -; X86-NEXT: vpermq %zmm0, %zmm1, %zmm3 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1} -; X86-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} -; X86-NEXT: vpaddq %zmm3, %zmm0, %zmm0 -; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ; X86-NEXT: retl %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 - %4 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer - %7 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1) - %res3 = add <8 x i64> %3, %6 - %res4 = add <8 x i64> %res3, %7 - ret <8 x i64> %res4 + ret <8 x i64> %3 +} + +define <8 x i64>@test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) { +; X64-LABEL: test_int_x86_avx512_maskz_permvar_di_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: test_int_x86_avx512_maskz_permvar_di_512: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} +; X86-NEXT: retl + %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 } declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) +define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_permvar_sf_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1) + ret <16 x float> %1 +} + define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) { ; X64-LABEL: test_int_x86_avx512_mask_permvar_sf_512: ; X64: # %bb.0: -; X64-NEXT: vpermps %zmm0, %zmm1, %zmm3 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1} -; X64-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} -; X64-NEXT: vaddps %zmm0, %zmm2, %zmm0 -; X64-NEXT: vaddps %zmm3, %zmm0, %zmm0 +; X64-NEXT: vmovaps %zmm2, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_permvar_sf_512: ; X86: # %bb.0: -; X86-NEXT: vpermps %zmm0, %zmm1, %zmm3 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1} -; X86-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} -; X86-NEXT: vaddps %zmm0, %zmm2, %zmm0 -; X86-NEXT: vaddps %zmm3, %zmm0, %zmm0 +; X86-NEXT: vmovaps %zmm2, %zmm0 ; X86-NEXT: retl %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %x2 - %4 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1) - %5 = bitcast i16 %x3 to <16 x i1> - %6 = select <16 x i1> %5, <16 x float> %4, <16 x float> zeroinitializer - %7 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1) - %res3 = fadd <16 x float> %3, %6 - %res4 = fadd <16 x float> %res3, %7 - ret <16 x float> %res4 + ret <16 x float> %3 +} + +define <16 x float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) { +; X64-LABEL: test_int_x86_avx512_maskz_permvar_sf_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: test_int_x86_avx512_maskz_permvar_sf_512: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} +; X86-NEXT: retl + %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) +define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_permvar_si_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1) + ret <16 x i32> %1 +} + define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X64-LABEL: test_int_x86_avx512_mask_permvar_si_512: ; X64: # %bb.0: -; X64-NEXT: vpermd %zmm0, %zmm1, %zmm3 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1} -; X64-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} -; X64-NEXT: vpaddd %zmm3, %zmm0, %zmm0 -; X64-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_permvar_si_512: ; X86: # %bb.0: -; X86-NEXT: vpermd %zmm0, %zmm1, %zmm3 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1} -; X86-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} -; X86-NEXT: vpaddd %zmm3, %zmm0, %zmm0 -; X86-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ; X86-NEXT: retl %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 - %4 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1) - %5 = bitcast i16 %x3 to <16 x i1> - %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer - %7 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1) - %res3 = add <16 x i32> %3, %6 - %res4 = add <16 x i32> %res3, %7 - ret <16 x i32> %res4 + ret <16 x i32> %3 +} + +define <16 x i32>@test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) { +; X64-LABEL: test_int_x86_avx512_maskz_permvar_si_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: test_int_x86_avx512_maskz_permvar_si_512: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} +; X86-NEXT: retl + %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 } declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32) diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll index 0ca1f7be88529a..c87f04fbb841b5 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -46,62 +46,99 @@ define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) { declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64) - define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) { +define <64 x i8>@test_int_x86_avx512_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_b_gpr_512: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x78,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_b_gpr_512: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastb %edi, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x7a,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1) + ret <64 x i8> %res +} + +define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x78,0x4c,0x24,0x04] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x6f,0xc1] -; X86-NEXT: vmovdqu8 %zmm1, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0xd1] -; X86-NEXT: vpaddb %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc2] -; X86-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x78,0x44,0x24,0x04] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastb %edi, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x7a,0xcf] ; X64-NEXT: kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce] ; X64-NEXT: vpbroadcastb %edi, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7a,0xc7] -; X64-NEXT: vpbroadcastb %edi, %zmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7a,0xd7] -; X64-NEXT: vpaddb %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc2] -; X64-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 %mask) - %res2 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> zeroinitializer, i64 %mask) - %res3 = add <64 x i8> %res, %res1 - %res4 = add <64 x i8> %res2, %res3 - ret <64 x i8> %res4 - } +; X64-NEXT: retq # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 %mask) + ret <64 x i8> %res +} + +define <64 x i8>@test_int_x86_avx512_maskz_pbroadcast_b_gpr_512(i8 %x0, i64 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_b_gpr_512: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x78,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_b_gpr_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce] +; X64-NEXT: vpbroadcastb %edi, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7a,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> zeroinitializer, i64 %mask) + ret <64 x i8> %res +} declare <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16, <32 x i16>, i32) - define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) { + +define <32 x i16>@test_int_x86_avx512_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_w_gpr_512: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x79,0x44,0x24,0x02] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_w_gpr_512: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw %edi, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x7b,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x79,0x4c,0x24,0x02] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xc1] -; X86-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xc9,0x6f,0xd1] -; X86-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X86-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x79,0x44,0x24,0x02] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastw %edi, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x7b,0xcf] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpbroadcastw %edi, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7b,0xc7] -; X64-NEXT: vpbroadcastw %edi, %zmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7b,0xd7] -; X64-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X64-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask) - %res2 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask) - %res3 = add <32 x i16> %res, %res1 - %res4 = add <32 x i16> %res2, %res3 - ret <32 x i16> %res4 - } + %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16>@test_int_x86_avx512_maskz_pbroadcast_w_gpr_512(i16 %x0, i32 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_w_gpr_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x79,0x44,0x24,0x02] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_w_gpr_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpbroadcastw %edi, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7b,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} declare void @llvm.x86.avx512.mask.storeu.b.512(i8*, <64 x i8>, i64) @@ -283,110 +320,152 @@ define <8 x i64>@test_int_x86_avx512_psrl_load_dq_512(<8 x i64>* %p0) { declare <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8>, <64 x i8>, i32, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_palignr_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x3) { +; CHECK-LABEL: test_int_x86_avx512_palignr_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x0f,0xc1,0x02] +; CHECK-NEXT: # zmm0 = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_palignr_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x3, i64 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_palignr_512: ; X86: # %bb.0: -; X86-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x0f,0xd9,0x02] -; X86-NEXT: # zmm3 = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x0f,0xd1,0x02] ; X86-NEXT: # zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] -; X86-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x0f,0xc1,0x02] -; X86-NEXT: # zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] -; X86-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X86-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_palignr_512: ; X64: # %bb.0: -; X64-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x0f,0xd9,0x02] -; X64-NEXT: # zmm3 = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x0f,0xd1,0x02] ; X64-NEXT: # zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 %x4) + ret <64 x i8> %res +} + +define <64 x i8>@test_int_x86_avx512_maskz_palignr_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_palignr_512: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x0f,0xc1,0x02] +; X86-NEXT: # zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_palignr_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x0f,0xc1,0x02] ; X64-NEXT: # zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] -; X64-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X64-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 %x4) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> zeroinitializer, i64 %x4) - %res2 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 -1) - %res3 = add <64 x i8> %res, %res1 - %res4 = add <64 x i8> %res3, %res2 - ret <64 x i8> %res4 + %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> zeroinitializer, i64 %x4) + ret <64 x i8> %res } declare <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16>, i32, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pshufh_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshufh_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufhw $3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7e,0x48,0x70,0xc0,0x03] +; CHECK-NEXT: # zmm0 = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pshufh_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshufh_w_512: ; X86: # %bb.0: -; X86-NEXT: vpshufhw $3, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x7e,0x48,0x70,0xd0,0x03] -; X86-NEXT: # zmm2 = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpshufhw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x49,0x70,0xc8,0x03] ; X86-NEXT: # zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] -; X86-NEXT: vpshufhw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0xc9,0x70,0xc0,0x03] -; X86-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] -; X86-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X86-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshufh_w_512: ; X64: # %bb.0: -; X64-NEXT: vpshufhw $3, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x7e,0x48,0x70,0xd0,0x03] -; X64-NEXT: # zmm2 = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpshufhw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x49,0x70,0xc8,0x03] ; X64-NEXT: # zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3) + ret <32 x i16> %res +} + +define <32 x i16>@test_int_x86_avx512_maskz_pshufh_w_512(<32 x i16> %x0, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pshufh_w_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpshufhw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0xc9,0x70,0xc0,0x03] +; X86-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pshufh_w_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshufhw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0xc9,0x70,0xc0,0x03] ; X64-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] -; X64-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X64-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3) - %res2 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1) - %res3 = add <32 x i16> %res, %res1 - %res4 = add <32 x i16> %res3, %res2 - ret <32 x i16> %res4 + %res = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3) + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16>, i32, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pshufl_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshufl_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshuflw $3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7f,0x48,0x70,0xc0,0x03] +; CHECK-NEXT: # zmm0 = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pshufl_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshufl_w_512: ; X86: # %bb.0: -; X86-NEXT: vpshuflw $3, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x7f,0x48,0x70,0xd0,0x03] -; X86-NEXT: # zmm2 = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpshuflw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x70,0xc8,0x03] ; X86-NEXT: # zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] -; X86-NEXT: vpshuflw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x70,0xc0,0x03] -; X86-NEXT: # zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] -; X86-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X86-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshufl_w_512: ; X64: # %bb.0: -; X64-NEXT: vpshuflw $3, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x7f,0x48,0x70,0xd0,0x03] -; X64-NEXT: # zmm2 = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpshuflw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x70,0xc8,0x03] ; X64-NEXT: # zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3) + ret <32 x i16> %res +} + +define <32 x i16>@test_int_x86_avx512_maskz_pshufl_w_512(<32 x i16> %x0, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pshufl_w_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpshuflw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x70,0xc0,0x03] +; X86-NEXT: # zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pshufl_w_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshuflw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x70,0xc0,0x03] ; X64-NEXT: # zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] -; X64-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X64-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3) - %res2 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1) - %res3 = add <32 x i16> %res, %res1 - %res4 = add <32 x i16> %res3, %res2 - ret <32 x i16> %res4 + %res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3) + ret <32 x i16> %res } define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) { @@ -547,372 +626,457 @@ declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32) declare <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_punpckhb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhb_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpckhbw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x68,0xc1] +; CHECK-NEXT: # zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_punpckhb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhb_w_512: ; X86: # %bb.0: -; X86-NEXT: vpunpckhbw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x68,0xd9] -; X86-NEXT: # zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpckhbw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x68,0xd1] ; X86-NEXT: # zmm2 {%k1} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] -; X86-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhb_w_512: ; X64: # %bb.0: -; X64-NEXT: vpunpckhbw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x68,0xd9] -; X64-NEXT: # zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpunpckhbw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x68,0xd1] ; X64-NEXT: # zmm2 {%k1} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] -; X64-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res2 = add <64 x i8> %res, %res1 - ret <64 x i8> %res2 + ret <64 x i8> %res } declare <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_punpcklb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpcklb_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpcklbw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x60,0xc1] +; CHECK-NEXT: # zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_punpcklb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpcklb_w_512: ; X86: # %bb.0: -; X86-NEXT: vpunpcklbw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x60,0xd9] -; X86-NEXT: # zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpcklbw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x60,0xd1] ; X86-NEXT: # zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] -; X86-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpcklb_w_512: ; X64: # %bb.0: -; X64-NEXT: vpunpcklbw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x60,0xd9] -; X64-NEXT: # zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpunpcklbw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x60,0xd1] ; X64-NEXT: # zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] -; X64-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res2 = add <64 x i8> %res, %res1 - ret <64 x i8> %res2 + ret <64 x i8> %res } declare <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_punpckhw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhw_d_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpckhwd %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x69,0xc1] +; CHECK-NEXT: # zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_punpckhw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhw_d_512: ; X86: # %bb.0: -; X86-NEXT: vpunpckhwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x69,0xd9] -; X86-NEXT: # zmm3 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpckhwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x69,0xd1] ; X86-NEXT: # zmm2 {%k1} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhw_d_512: ; X64: # %bb.0: -; X64-NEXT: vpunpckhwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x69,0xd9] -; X64-NEXT: # zmm3 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpckhwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x69,0xd1] ; X64-NEXT: # zmm2 {%k1} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_punpcklw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpcklw_d_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpcklwd %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x61,0xc1] +; CHECK-NEXT: # zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_punpcklw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpcklw_d_512: ; X86: # %bb.0: -; X86-NEXT: vpunpcklwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x61,0xd9] -; X86-NEXT: # zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpcklwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x61,0xd1] ; X86-NEXT: # zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpcklw_d_512: ; X64: # %bb.0: -; X64-NEXT: vpunpcklwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x61,0xd9] -; X64-NEXT: # zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpcklwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x61,0xd1] ; X64-NEXT: # zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_pmaxs_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxs_b_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x3c,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_pmaxs_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxs_b_512: ; X86: # %bb.0: -; X86-NEXT: vpmaxsb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x3c,0xd9] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxsb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x3c,0xd1] -; X86-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_b_512: ; X64: # %bb.0: -; X64-NEXT: vpmaxsb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x3c,0xd9] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxsb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x3c,0xd1] -; X64-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res2 = add <64 x i8> %res, %res1 - ret <64 x i8> %res2 + ret <64 x i8> %res } declare <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pmaxs_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxs_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xee,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pmaxs_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxs_w_512: ; X86: # %bb.0: -; X86-NEXT: vpmaxsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xee,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xee,0xd1] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_w_512: ; X64: # %bb.0: -; X64-NEXT: vpmaxsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xee,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xee,0xd1] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_pmaxu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxu_b_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xde,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_pmaxu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxu_b_512: ; X86: # %bb.0: -; X86-NEXT: vpmaxub %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xde,0xd9] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxub %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xde,0xd1] -; X86-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_b_512: ; X64: # %bb.0: -; X64-NEXT: vpmaxub %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xde,0xd9] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxub %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xde,0xd1] -; X64-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res2 = add <64 x i8> %res, %res1 - ret <64 x i8> %res2 + ret <64 x i8> %res } declare <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pmaxu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxu_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x3e,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pmaxu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxu_w_512: ; X86: # %bb.0: -; X86-NEXT: vpmaxuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x3e,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x3e,0xd1] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_w_512: ; X64: # %bb.0: -; X64-NEXT: vpmaxuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x3e,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x3e,0xd1] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_pmins_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmins_b_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminsb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x38,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_pmins_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmins_b_512: ; X86: # %bb.0: -; X86-NEXT: vpminsb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x38,0xd9] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminsb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x38,0xd1] -; X86-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_b_512: ; X64: # %bb.0: -; X64-NEXT: vpminsb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x38,0xd9] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpminsb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x38,0xd1] -; X64-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res2 = add <64 x i8> %res, %res1 - ret <64 x i8> %res2 + ret <64 x i8> %res } declare <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pmins_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmins_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminsw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xea,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pmins_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmins_w_512: ; X86: # %bb.0: -; X86-NEXT: vpminsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xea,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xea,0xd1] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_w_512: ; X64: # %bb.0: -; X64-NEXT: vpminsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xea,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xea,0xd1] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_pminu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pminu_b_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminub %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xda,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_pminu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pminu_b_512: ; X86: # %bb.0: -; X86-NEXT: vpminub %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xda,0xd9] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminub %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xda,0xd1] -; X86-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_b_512: ; X64: # %bb.0: -; X64-NEXT: vpminub %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xda,0xd9] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpminub %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xda,0xd1] -; X64-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res2 = add <64 x i8> %res, %res1 - ret <64 x i8> %res2 + ret <64 x i8> %res } declare <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pminu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pminu_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminuw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x3a,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pminu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pminu_w_512: ; X86: # %bb.0: -; X86-NEXT: vpminuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x3a,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x3a,0xd1] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_w_512: ; X64: # %bb.0: -; X64-NEXT: vpminuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x3a,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x3a,0xd1] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pmovzxb_w_512(<32 x i8> %x0, <32 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxb_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbw %ymm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x30,0xc0] +; CHECK-NEXT: # zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pmovzxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_w_512: ; X86: # %bb.0: -; X86-NEXT: vpmovzxbw %ymm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x30,0xd0] -; X86-NEXT: # zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovzxbw %ymm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x30,0xc8] ; X86-NEXT: # zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; X86-NEXT: vpmovzxbw %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x30,0xc0] -; X86-NEXT: # zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; X86-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X86-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_w_512: ; X64: # %bb.0: -; X64-NEXT: vpmovzxbw %ymm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x30,0xd0] -; X64-NEXT: # zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovzxbw %ymm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x30,0xc8] ; X64-NEXT: # zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) + ret <32 x i16> %res +} + +define <32 x i16>@test_int_x86_avx512_maskz_pmovzxb_w_512(<32 x i8> %x0, i32 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxb_w_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovzxbw %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x30,0xc0] +; X86-NEXT: # zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxb_w_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovzxbw %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x30,0xc0] ; X64-NEXT: # zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; X64-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X64-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2) - %res2 = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 -1) - %res3 = add <32 x i16> %res, %res1 - %res4 = add <32 x i16> %res3, %res2 - ret <32 x i16> %res4 + %res = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2) + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pmovsxb_w_512(<32 x i8> %x0, <32 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxb_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbw %ymm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x20,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pmovsxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_w_512: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbw %ymm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x20,0xd0] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovsxbw %ymm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x20,0xc8] -; X86-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x20,0xc0] -; X86-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X86-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_w_512: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbw %ymm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x20,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovsxbw %ymm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x20,0xc8] -; X64-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x20,0xc0] -; X64-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X64-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2) - %res2 = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 -1) - %res3 = add <32 x i16> %res, %res1 - %res4 = add <32 x i16> %res3, %res2 - ret <32 x i16> %res4 + ret <32 x i16> %res +} + +define <32 x i16>@test_int_x86_avx512_maskz_pmovsxb_w_512(<32 x i8> %x0, i32 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxb_w_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x20,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxb_w_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x20,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2) + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16>, <8 x i16>, <32 x i16>, i32) @@ -1097,26 +1261,31 @@ define <32 x i16>@test_int_x86_avx512_mask_psll_wi_512(<32 x i16> %x0, i32 %x1, declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshuf_b_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x00,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshuf_b_512: ; X86: # %bb.0: -; X86-NEXT: vpshufb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x00,0xd9] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpshufb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x00,0xd1] -; X86-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshuf_b_512: ; X64: # %bb.0: -; X64-NEXT: vpshufb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x00,0xd9] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpshufb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x00,0xd1] -; X64-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res2 = add <64 x i8> %res, %res1 - ret <64 x i8> %res2 + ret <64 x i8> %res } @@ -2631,178 +2800,221 @@ define i32@test_int_x86_avx512_cvtw2mask_512(<32 x i16> %x0) { declare <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pmulhu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmulhu_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe4,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmulhu_w_512: ; X86: # %bb.0: -; X86-NEXT: vpmulhuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe4,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmulhuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe4,0xd1] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmulhu_w_512: ; X64: # %bb.0: -; X64-NEXT: vpmulhuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe4,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmulhuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe4,0xd1] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmulh_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmulhw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe5,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmulh_w_512: ; X86: # %bb.0: -; X86-NEXT: vpmulhw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe5,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmulhw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe5,0xd1] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmulh_w_512: ; X64: # %bb.0: -; X64-NEXT: vpmulhw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe5,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmulhw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe5,0xd1] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmulhr_sw_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x0b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512: ; X86: # %bb.0: -; X86-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x0b,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x0b,0xd1] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512: ; X64: # %bb.0: -; X64-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x0b,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x0b,0xd1] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaddubs_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x04,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512: ; X86: # %bb.0: -; X86-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x04,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x04,0xd1] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512: ; X64: # %bb.0: -; X64-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x04,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x04,0xd1] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaddw_d_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xf5,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaddw_d_512: ; X86: # %bb.0: -; X86-NEXT: vpmaddwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xf5,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaddwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xf5,0xd1] -; X86-NEXT: vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaddw_d_512: ; X64: # %bb.0: -; X64-NEXT: vpmaddwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xf5,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaddwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xf5,0xd1] -; X64-NEXT: vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_hi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_hi_512: ; X86: # %bb.0: -; X86-NEXT: vpermw %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x8d,0xd8] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0x8d,0xd0] -; X86-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0] -; X86-NEXT: vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3] -; X86-NEXT: vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_hi_512: ; X64: # %bb.0: -; X64-NEXT: vpermw %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0x8d,0xd0] -; X64-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0] -; X64-NEXT: vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3] -; X64-NEXT: vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3) - %res2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res3 = add <32 x i16> %res, %res1 - %res4 = add <32 x i16> %res3, %res2 - ret <32 x i16> %res4 + ret <32 x i16> %res +} + +define <32 x i16>@test_int_x86_avx512_maskz_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_hi_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_hi_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3) + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_hi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X86-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x7d,0xca] -; X86-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X64-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x7d,0xca] -; X64-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) @@ -2810,51 +3022,46 @@ declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i1 define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X86-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x7d,0xca] -; X86-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X86-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x75,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X64-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x7d,0xca] -; X64-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X64-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x75,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_hi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2w %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X86-NEXT: vpermt2w %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x75,0xca] -; X86-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X64-NEXT: vpermt2w %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x75,0xca] -; X64-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8>, <64 x i8>, i32, <32 x i16>, i32) @@ -3985,30 +4192,45 @@ define <32 x i16>@test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16> declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmov_wb_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovwb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_512: ; X86: # %bb.0: -; X86-NEXT: vpmovwb %zmm0, %ymm2 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc2] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovwb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x30,0xc1] -; X86-NEXT: vpmovwb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc0] -; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # encoding: [0xc5,0xf5,0xfc,0xc0] -; X86-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # encoding: [0xc5,0xed,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_512: ; X64: # %bb.0: -; X64-NEXT: vpmovwb %zmm0, %ymm2 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc2] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovwb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x30,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) + ret <32 x i8> %res +} + +define <32 x i8>@test_int_x86_avx512_maskz_pmov_wb_512(<32 x i16> %x0, i32 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmov_wb_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovwb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmov_wb_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovwb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc0] -; X64-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # encoding: [0xc5,0xf5,0xfc,0xc0] -; X64-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # encoding: [0xc5,0xed,0xfc,0xc0] -; X64-NEXT: retq # encoding: [0xc3] - %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) - %res3 = add <32 x i8> %res0, %res1 - %res4 = add <32 x i8> %res3, %res2 - ret <32 x i8> %res4 +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) + ret <32 x i8> %res } diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll index 613bbd6633d6f0..e84d9d37c82227 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -683,84 +683,82 @@ define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %pt declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) +define <32 x i16>@test_int_x86_avx512_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_hi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2) + ret <32 x i16> %1 +} + define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X86-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x7d,0xca] -; X86-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X64-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x7d,0xca] -; X64-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2) %2 = bitcast i32 %x3 to <32 x i1> %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x1 - %4 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2) - %res2 = add <32 x i16> %3, %4 - ret <32 x i16> %res2 + ret <32 x i16> %3 } define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X86-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x7d,0xca] -; X86-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X86-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x75,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X64-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x7d,0xca] -; X64-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X64-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x75,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2) %2 = bitcast i32 %x3 to <32 x i1> %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer - %4 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2) - %res2 = add <32 x i16> %3, %4 - ret <32 x i16> %res2 + ret <32 x i16> %3 } declare <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>) +define <32 x i16>@test_int_x86_avx512_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_hi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2w %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) + ret <32 x i16> %1 +} + define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X86-NEXT: vpermt2w %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x75,0xca] -; X86-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X64-NEXT: vpermt2w %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x75,0xca] -; X64-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) %2 = bitcast i32 %x3 to <32 x i1> %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x1 - %4 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) - %res2 = add <32 x i16> %3, %4 - ret <32 x i16> %res2 + ret <32 x i16> %3 } declare <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8>, <64 x i8>) @@ -942,36 +940,51 @@ define <32 x i16> @test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x ret <32 x i16> %res2 } +define <32 x i8>@test_int_x86_avx512_pmov_wb_512(<32 x i16> %x0) { +; CHECK-LABEL: test_int_x86_avx512_pmov_wb_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovwb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = trunc <32 x i16> %x0 to <32 x i8> + ret <32 x i8> %1 +} + define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_512: ; X86: # %bb.0: -; X86-NEXT: vpmovwb %zmm0, %ymm2 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc2] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovwb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x30,0xc1] -; X86-NEXT: vpmovwb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc0] -; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # encoding: [0xc5,0xf5,0xfc,0xc0] -; X86-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # encoding: [0xc5,0xed,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_512: ; X64: # %bb.0: -; X64-NEXT: vpmovwb %zmm0, %ymm2 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc2] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovwb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x30,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = trunc <32 x i16> %x0 to <32 x i8> + %2 = bitcast i32 %x2 to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %x1 + ret <32 x i8> %3 +} + +define <32 x i8>@test_int_x86_avx512_maskz_pmov_wb_512(<32 x i16> %x0, i32 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmov_wb_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovwb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmov_wb_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovwb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc0] -; X64-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # encoding: [0xc5,0xf5,0xfc,0xc0] -; X64-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # encoding: [0xc5,0xed,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %1 = trunc <32 x i16> %x0 to <32 x i8> - %2 = trunc <32 x i16> %x0 to <32 x i8> - %3 = bitcast i32 %x2 to <32 x i1> - %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %x1 - %5 = trunc <32 x i16> %x0 to <32 x i8> - %6 = bitcast i32 %x2 to <32 x i1> - %7 = select <32 x i1> %6, <32 x i8> %5, <32 x i8> zeroinitializer - %res3 = add <32 x i8> %1, %4 - %res4 = add <32 x i8> %res3, %7 - ret <32 x i8> %res4 + %2 = bitcast i32 %x2 to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 } declare void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16>, i32) @@ -1000,32 +1013,47 @@ define void @test_int_x86_avx512_mask_pmov_wb_mem_512(i8* %ptr, <32 x i16> %x1, declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_pmovs_wb_512(<32 x i16> %x0, <32 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovs_wb_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovswb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x20,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_pmovs_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovs_wb_512: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovswb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x20,0xc1] -; X86-NEXT: vpmovswb %zmm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x20,0xc2] -; X86-NEXT: vpaddb %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0xfc,0xca] -; X86-NEXT: vpmovswb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x20,0xc0] -; X86-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfc,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovs_wb_512: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovswb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x20,0xc1] -; X64-NEXT: vpmovswb %zmm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x20,0xc2] -; X64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0xfc,0xca] -; X64-NEXT: vpmovswb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x20,0xc0] -; X64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfc,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) - %res3 = add <32 x i8> %res0, %res1 - %res4 = add <32 x i8> %res3, %res2 - ret <32 x i8> %res4 + %res = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) + ret <32 x i8> %res +} + +define <32 x i8>@test_int_x86_avx512_maskz_pmovs_wb_512(<32 x i16> %x0, i32 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovs_wb_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovswb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x20,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovs_wb_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmovswb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x20,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) + ret <32 x i8> %res } declare void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16>, i32) @@ -1054,32 +1082,47 @@ define void @test_int_x86_avx512_mask_pmovs_wb_mem_512(i8* %ptr, <32 x i16> %x1, declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_pmovus_wb_512(<32 x i16> %x0, <32 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovus_wb_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovuswb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x10,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_pmovus_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovus_wb_512: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovuswb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x10,0xc1] -; X86-NEXT: vpmovuswb %zmm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x10,0xc2] -; X86-NEXT: vpaddb %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0xfc,0xca] -; X86-NEXT: vpmovuswb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x10,0xc0] -; X86-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfc,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovus_wb_512: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovuswb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x10,0xc1] -; X64-NEXT: vpmovuswb %zmm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x10,0xc2] -; X64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0xfc,0xca] -; X64-NEXT: vpmovuswb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x10,0xc0] -; X64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfc,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) + ret <32 x i8> %res +} + +define <32 x i8>@test_int_x86_avx512_maskz_pmovus_wb_512(<32 x i16> %x0, i32 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovus_wb_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovuswb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x10,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovus_wb_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmovuswb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x10,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) - %res3 = add <32 x i8> %res0, %res1 - %res4 = add <32 x i8> %res3, %res2 - ret <32 x i8> %res4 + %res = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) + ret <32 x i8> %res } declare void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16>, i32) @@ -1353,36 +1396,51 @@ define <32 x i16>@test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16> declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) +define <32 x i16>@test_int_x86_avx512_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_permvar_hi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1) + ret <32 x i16> %1 +} + define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_hi_512: ; X86: # %bb.0: -; X86-NEXT: vpermw %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x8d,0xd8] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0x8d,0xd0] -; X86-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0] -; X86-NEXT: vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3] -; X86-NEXT: vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_hi_512: ; X64: # %bb.0: -; X64-NEXT: vpermw %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0x8d,0xd0] -; X64-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0] -; X64-NEXT: vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3] -; X64-NEXT: vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1) %2 = bitcast i32 %x3 to <32 x i1> %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x2 - %4 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1) - %5 = bitcast i32 %x3 to <32 x i1> - %6 = select <32 x i1> %5, <32 x i16> %4, <32 x i16> zeroinitializer - %7 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1) - %res3 = add <32 x i16> %3, %6 - %res4 = add <32 x i16> %res3, %7 - ret <32 x i16> %res4 + ret <32 x i16> %3 +} + +define <32 x i16>@test_int_x86_avx512_maskz_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_hi_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_hi_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1) + %2 = bitcast i32 %x3 to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer + ret <32 x i16> %3 } define <32 x i16> @test_x86_avx512_psll_w_512(<32 x i16> %a0, <8 x i16> %a1) { diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll index 16bd8fd65cb792..1f4185ed6dc20e 100644 --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -4,308 +4,469 @@ declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_b_gpr_128: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_b_gpr_128: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastb %edi, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0x4c,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x09,0x6f,0xc1] -; X86-NEXT: vmovdqu8 %xmm1, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0x89,0x6f,0xd1] -; X86-NEXT: vpaddb %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2] -; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x78,0x44,0x24,0x04] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastb %edi, %xmm1 # encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xcf] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpbroadcastb %edi, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7] -; X64-NEXT: vpbroadcastb %edi, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xd7] -; X64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2] -; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 %mask) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> zeroinitializer, i16 %mask) - %res3 = add <16 x i8> %res, %res1 - %res4 = add <16 x i8> %res2, %res3 - ret <16 x i8> %res4 + %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pbroadcast_b_gpr_128(i8 %x0, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_b_gpr_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x78,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_b_gpr_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpbroadcastb %edi, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res } declare <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_w_gpr_128: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_w_gpr_128: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw %edi, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x4c,0x24,0x04] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1] -; X86-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0xd1] -; X86-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x79,0x44,0x24,0x02] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastw %edi, %xmm1 # encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xcf] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpbroadcastw %edi, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7] -; X64-NEXT: vpbroadcastw %edi, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xd7] -; X64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 %mask) - %res2 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> zeroinitializer, i8 %mask) - %res3 = add <8 x i16> %res, %res1 - %res4 = add <8 x i16> %res2, %res3 - ret <8 x i16> %res4 + %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 %mask) + ret <8 x i16> %res } +define <8 x i16>@test_int_x86_avx512_maskz_pbroadcast_w_gpr_128(i16 %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_w_gpr_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x79,0x44,0x24,0x02] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_w_gpr_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpbroadcastw %edi, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} - declare <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8, <32 x i8>, i32) +declare <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_b_gpr_256: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_b_gpr_256: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastb %edi, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1) + ret <32 x i8> %res +} - define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) { +define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0x4c,0x24,0x04] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x6f,0xc1] -; X86-NEXT: vmovdqu8 %ymm1, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0xd1] -; X86-NEXT: vpaddb %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc2] -; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x78,0x44,0x24,0x04] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastb %edi, %ymm1 # encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xcf] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpbroadcastb %edi, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc7] -; X64-NEXT: vpbroadcastb %edi, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xd7] -; X64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc2] -; X64-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 %mask) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> zeroinitializer, i32 %mask) - %res3 = add <32 x i8> %res, %res1 - %res4 = add <32 x i8> %res2, %res3 - ret <32 x i8> %res4 - } - + %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 %mask) + ret <32 x i8> %res +} +define <32 x i8>@test_int_x86_avx512_maskz_pbroadcast_b_gpr_256(i8 %x0, i32 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_b_gpr_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x78,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_b_gpr_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpbroadcastb %edi, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> zeroinitializer, i32 %mask) + ret <32 x i8> %res +} declare <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16, <16 x i16>, i16) - define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) { +define <16 x i16>@test_int_x86_avx512_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_w_gpr_256: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_w_gpr_256: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw %edi, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x4c,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xc1] -; X86-NEXT: vmovdqu16 %ymm1, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x6f,0xd1] -; X86-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x79,0x44,0x24,0x02] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastw %edi, %ymm1 # encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xcf] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpbroadcastw %edi, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7] -; X64-NEXT: vpbroadcastw %edi, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xd7] -; X64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 %mask) - %res2 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> zeroinitializer, i16 %mask) - %res3 = add <16 x i16> %res, %res1 - %res4 = add <16 x i16> %res2, %res3 - ret <16 x i16> %res4 - } + %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_pbroadcast_w_gpr_256(i16 %x0, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_w_gpr_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x79,0x44,0x24,0x02] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_w_gpr_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpbroadcastw %edi, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} declare <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8>, <32 x i8>, i32) -define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastb_256: +define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1) + ret <32 x i8> %res +} + +define <32 x i8>@test_int_x86_avx512_mask_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastb_256: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastb %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8] -; X86-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0] -; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] -; X86-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastb_256: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastb_256: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastb %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8>@test_int_x86_avx512_maskz_pbroadcastb_256(<16 x i8> %x0, i32 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastb_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastb_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0] -; X64-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] -; X64-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1) - %res1 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) - %res2 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> zeroinitializer, i32 %mask) - %res3 = add <32 x i8> %res, %res1 - %res4 = add <32 x i8> %res2, %res3 - ret <32 x i8> %res4 + %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> zeroinitializer, i32 %mask) + ret <32 x i8> %res } declare <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8>, <16 x i8>, i16) -define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastb_128: +define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1) + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_mask_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastb_128: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastb %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8] -; X86-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastb_128: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastb_128: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastb %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pbroadcastb_128(<16 x i8> %x0, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastb_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastb_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1) - %res1 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) - %res2 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> zeroinitializer, i16 %mask) - %res3 = add <16 x i8> %res, %res1 - %res4 = add <16 x i8> %res2, %res3 - ret <16 x i8> %res4 + %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res } declare <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16>, <16 x i16>, i16) -define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastw_256: +define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_mask_pbroadcastw_256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastw_256: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastw %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8] -; X86-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0] -; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] -; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastw_256: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastw_256: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastw %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_pbroadcastw_256(<8 x i16> %x0, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastw_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastw_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0] -; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] -; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1) - %res1 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) - %res2 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> zeroinitializer, i16 %mask) - %res3 = add <16 x i16> %res, %res1 - %res4 = add <16 x i16> %res2, %res3 - ret <16 x i16> %res4 + %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res } declare <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16>, <8 x i16>, i8) -define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastw_128: +define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16>@test_int_x86_avx512_mask_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastw_128: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastw %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8] -; X86-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0] -; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastw_128: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastw_128: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastw %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16>@test_int_x86_avx512_maskz_pbroadcastw_128(<8 x i16> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastw_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastw_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0] -; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1) - %res1 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) - %res2 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> zeroinitializer, i8 %mask) - %res3 = add <8 x i16> %res, %res1 - %res4 = add <8 x i16> %res2, %res3 - ret <8 x i16> %res4 + %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res } declare <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8>, <64 x i8>, i64) -define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastb_512: +define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1) + ret <64 x i8> %res +} + +define <64 x i8>@test_int_x86_avx512_mask_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastb_512: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastb %xmm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8] -; X86-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0] -; X86-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] -; X86-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastb_512: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastb_512: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastb %xmm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) + ret <64 x i8> %res +} + +define <64 x i8>@test_int_x86_avx512_maskz_pbroadcastb_512(<16 x i8> %x0, i64 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastb_512: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastb_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0] -; X64-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] -; X64-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1) - %res1 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) - %res2 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> zeroinitializer, i64 %mask) - %res3 = add <64 x i8> %res, %res1 - %res4 = add <64 x i8> %res2, %res3 - ret <64 x i8> %res4 + %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> zeroinitializer, i64 %mask) + ret <64 x i8> %res } declare <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16>, <32 x i16>, i32) -define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastw_512: +define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16>@test_int_x86_avx512_mask_pbroadcastw_512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastw_512: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastw %xmm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8] -; X86-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0] -; X86-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] -; X86-NEXT: vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastw_512: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastw_512: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastw %xmm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16>@test_int_x86_avx512_maskz_pbroadcastw_512(<8 x i16> %x0, i32 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastw_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastw_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0] -; X64-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] -; X64-NEXT: vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1) - %res1 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask) - %res2 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> zeroinitializer, i32 %mask) - %res3 = add <32 x i16> %res, %res1 - %res4 = add <32 x i16> %res2, %res3 - ret <32 x i16> %res4 + %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res } declare void @llvm.x86.avx512.mask.storeu.b.128(i8*, <16 x i8>, i16) @@ -524,220 +685,306 @@ define <32 x i8>@test_int_x86_avx512_mask_loadu_b_256(i8* %ptr, i8* %ptr2, <32 x declare <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8>, <16 x i8>, i32, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_palignr_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x3) { +; CHECK-LABEL: test_int_x86_avx512_palignr_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xc1,0x02] +; CHECK-NEXT: # xmm0 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_palignr_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x3, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_palignr_128: ; X86: # %bb.0: -; X86-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xd9,0x02] -; X86-NEXT: # xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x0f,0xd1,0x02] ; X86-NEXT: # xmm2 {%k1} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] -; X86-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x0f,0xc1,0x02] -; X86-NEXT: # xmm0 {%k1} {z} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] -; X86-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_palignr_128: ; X64: # %bb.0: -; X64-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xd9,0x02] -; X64-NEXT: # xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x0f,0xd1,0x02] ; X64-NEXT: # xmm2 {%k1} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 %x4) + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_palignr_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_palignr_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x0f,0xc1,0x02] +; X86-NEXT: # xmm0 {%k1} {z} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_palignr_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x0f,0xc1,0x02] ; X64-NEXT: # xmm0 {%k1} {z} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] -; X64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 %x4) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> zeroinitializer, i16 %x4) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 -1) - %res3 = add <16 x i8> %res, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 + %res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> zeroinitializer, i16 %x4) + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8>, <32 x i8>, i32, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_palignr_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x3) { +; CHECK-LABEL: test_int_x86_avx512_palignr_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x0f,0xc1,0x02] +; CHECK-NEXT: # ymm0 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_palignr_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x3, i32 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_palignr_256: ; X86: # %bb.0: -; X86-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x0f,0xd9,0x02] -; X86-NEXT: # ymm3 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x0f,0xd1,0x02] ; X86-NEXT: # ymm2 {%k1} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] -; X86-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x0f,0xc1,0x02] -; X86-NEXT: # ymm0 {%k1} {z} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] -; X86-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X86-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_palignr_256: ; X64: # %bb.0: -; X64-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x0f,0xd9,0x02] -; X64-NEXT: # ymm3 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x0f,0xd1,0x02] ; X64-NEXT: # ymm2 {%k1} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 %x4) + ret <32 x i8> %res +} + +define <32 x i8>@test_int_x86_avx512_maskz_palignr_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_palignr_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x0f,0xc1,0x02] +; X86-NEXT: # ymm0 {%k1} {z} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_palignr_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x0f,0xc1,0x02] ; X64-NEXT: # ymm0 {%k1} {z} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] -; X64-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X64-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 %x4) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> zeroinitializer, i32 %x4) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 -1) - %res3 = add <32 x i8> %res, %res1 - %res4 = add <32 x i8> %res3, %res2 - ret <32 x i8> %res4 + %res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> zeroinitializer, i32 %x4) + ret <32 x i8> %res } declare <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16>, i32, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pshufh_w_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshufh_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x70,0xc0,0x03] +; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,7,4,4,4] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pshufh_w_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshufh_w_128: ; X86: # %bb.0: -; X86-NEXT: vpshufhw $3, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x70,0xd0,0x03] -; X86-NEXT: # xmm2 = xmm0[0,1,2,3,7,4,4,4] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpshufhw $3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x70,0xc8,0x03] ; X86-NEXT: # xmm1 {%k1} = xmm0[0,1,2,3,7,4,4,4] -; X86-NEXT: vpshufhw $3, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x70,0xc0,0x03] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0,1,2,3,7,4,4,4] -; X86-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshufh_w_128: ; X64: # %bb.0: -; X64-NEXT: vpshufhw $3, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x70,0xd0,0x03] -; X64-NEXT: # xmm2 = xmm0[0,1,2,3,7,4,4,4] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpshufhw $3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x70,0xc8,0x03] ; X64-NEXT: # xmm1 {%k1} = xmm0[0,1,2,3,7,4,4,4] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) + ret <8 x i16> %res +} + +define <8 x i16>@test_int_x86_avx512_maskz_pshufh_w_128(<8 x i16> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pshufh_w_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpshufhw $3, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x70,0xc0,0x03] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0,1,2,3,7,4,4,4] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pshufh_w_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshufhw $3, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x70,0xc0,0x03] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[0,1,2,3,7,4,4,4] -; X64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3) - %res2 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1) - %res3 = add <8 x i16> %res, %res1 - %res4 = add <8 x i16> %res3, %res2 - ret <8 x i16> %res4 + %res = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3) + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16>, i32, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_pshufh_w_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshufh_w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x70,0xc0,0x03] +; CHECK-NEXT: # ymm0 = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_pshufh_w_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshufh_w_256: ; X86: # %bb.0: -; X86-NEXT: vpshufhw $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x70,0xd0,0x03] -; X86-NEXT: # ymm2 = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpshufhw $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x70,0xc8,0x03] ; X86-NEXT: # ymm1 {%k1} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] -; X86-NEXT: vpshufhw $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0xa9,0x70,0xc0,0x03] -; X86-NEXT: # ymm0 {%k1} {z} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] -; X86-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshufh_w_256: ; X64: # %bb.0: -; X64-NEXT: vpshufhw $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x70,0xd0,0x03] -; X64-NEXT: # ymm2 = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpshufhw $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x70,0xc8,0x03] ; X64-NEXT: # ymm1 {%k1} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_pshufh_w_256(<16 x i16> %x0, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pshufh_w_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpshufhw $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0xa9,0x70,0xc0,0x03] +; X86-NEXT: # ymm0 {%k1} {z} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pshufh_w_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshufhw $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0xa9,0x70,0xc0,0x03] ; X64-NEXT: # ymm0 {%k1} {z} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] -; X64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3) - %res2 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1) - %res3 = add <16 x i16> %res, %res1 - %res4 = add <16 x i16> %res3, %res2 - ret <16 x i16> %res4 + %res = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3) + ret <16 x i16> %res } declare <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16>, i32, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pshufl_w_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshufl_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x70,0xc0,0x03] +; CHECK-NEXT: # xmm0 = xmm0[3,0,0,0,4,5,6,7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pshufl_w_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshufl_w_128: ; X86: # %bb.0: -; X86-NEXT: vpshuflw $3, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x70,0xd0,0x03] -; X86-NEXT: # xmm2 = xmm0[3,0,0,0,4,5,6,7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpshuflw $3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x09,0x70,0xc8,0x03] ; X86-NEXT: # xmm1 {%k1} = xmm0[3,0,0,0,4,5,6,7] -; X86-NEXT: vpshuflw $3, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0x89,0x70,0xc0,0x03] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[3,0,0,0,4,5,6,7] -; X86-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshufl_w_128: ; X64: # %bb.0: -; X64-NEXT: vpshuflw $3, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x70,0xd0,0x03] -; X64-NEXT: # xmm2 = xmm0[3,0,0,0,4,5,6,7] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpshuflw $3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x09,0x70,0xc8,0x03] ; X64-NEXT: # xmm1 {%k1} = xmm0[3,0,0,0,4,5,6,7] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) + ret <8 x i16> %res +} + +define <8 x i16>@test_int_x86_avx512_maskz_pshufl_w_128(<8 x i16> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pshufl_w_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpshuflw $3, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0x89,0x70,0xc0,0x03] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[3,0,0,0,4,5,6,7] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pshufl_w_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshuflw $3, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0x89,0x70,0xc0,0x03] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[3,0,0,0,4,5,6,7] -; X64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3) - %res2 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1) - %res3 = add <8 x i16> %res, %res1 - %res4 = add <8 x i16> %res3, %res2 - ret <8 x i16> %res4 + %res = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3) + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16>, i32, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_pshufl_w_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_pshufl_w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xff,0x70,0xc0,0x03] +; CHECK-NEXT: # ymm0 = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_pshufl_w_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshufl_w_256: ; X86: # %bb.0: -; X86-NEXT: vpshuflw $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xff,0x70,0xd0,0x03] -; X86-NEXT: # ymm2 = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpshuflw $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x70,0xc8,0x03] ; X86-NEXT: # ymm1 {%k1} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] -; X86-NEXT: vpshuflw $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xa9,0x70,0xc0,0x03] -; X86-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] -; X86-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshufl_w_256: ; X64: # %bb.0: -; X64-NEXT: vpshuflw $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xff,0x70,0xd0,0x03] -; X64-NEXT: # ymm2 = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpshuflw $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x70,0xc8,0x03] ; X64-NEXT: # ymm1 {%k1} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_pshufl_w_256(<16 x i16> %x0, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pshufl_w_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpshuflw $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xa9,0x70,0xc0,0x03] +; X86-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pshufl_w_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshuflw $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xa9,0x70,0xc0,0x03] ; X64-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] -; X64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3) - %res2 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1) - %res3 = add <16 x i16> %res, %res1 - %res4 = add <16 x i16> %res3, %res2 - ret <16 x i16> %res4 + %res = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3) + ret <16 x i16> %res } define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) { @@ -1012,228 +1259,260 @@ declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8) declare <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_punpckhb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhb_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x68,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_punpckhb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhb_w_128: ; X86: # %bb.0: -; X86-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x68,0xd9] -; X86-NEXT: # xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x68,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; X86-NEXT: vpaddb %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhb_w_128: ; X64: # %bb.0: -; X64-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x68,0xd9] -; X64-NEXT: # xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x68,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; X64-NEXT: vpaddb %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) - %res2 = add <16 x i8> %res, %res1 - ret <16 x i8> %res2 + ret <16 x i8> %res } declare <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_ask_punpcklb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_ask_punpcklb_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x60,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_punpcklb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpcklb_w_128: ; X86: # %bb.0: -; X86-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x60,0xd9] -; X86-NEXT: # xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x60,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-NEXT: vpaddb %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpcklb_w_128: ; X64: # %bb.0: -; X64-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x60,0xd9] -; X64-NEXT: # xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x60,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-NEXT: vpaddb %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) - %res2 = add <16 x i8> %res, %res1 - ret <16 x i8> %res2 + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_punpckhb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhb_w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x68,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_punpckhb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhb_w_256: ; X86: # %bb.0: -; X86-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x68,0xd9] -; X86-NEXT: # ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x68,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; X86-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhb_w_256: ; X64: # %bb.0: -; X64-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x68,0xd9] -; X64-NEXT: # ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x68,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; X64-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res2 = add <32 x i8> %res, %res1 - ret <32 x i8> %res2 + ret <32 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_punpcklb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpcklb_w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x60,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_punpcklb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpcklb_w_256: ; X86: # %bb.0: -; X86-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x60,0xd9] -; X86-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x60,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; X86-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpcklb_w_256: ; X64: # %bb.0: -; X64-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x60,0xd9] -; X64-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x60,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; X64-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res2 = add <32 x i8> %res, %res1 - ret <32 x i8> %res2 + ret <32 x i8> %res } declare <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_punpcklw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpcklw_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x61,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_punpcklw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpcklw_d_128: ; X86: # %bb.0: -; X86-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x61,0xd9] -; X86-NEXT: # xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x61,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpcklw_d_128: ; X64: # %bb.0: -; X64-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x61,0xd9] -; X64-NEXT: # xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x61,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_punpckhw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhw_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x69,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_punpckhw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhw_d_128: ; X86: # %bb.0: -; X86-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x69,0xd9] -; X86-NEXT: # xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x69,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhw_d_128: ; X64: # %bb.0: -; X64-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x69,0xd9] -; X64-NEXT: # xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x69,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_punpcklw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpcklw_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x61,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_punpcklw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpcklw_d_256: ; X86: # %bb.0: -; X86-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x61,0xd9] -; X86-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x61,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; X86-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpcklw_d_256: ; X64: # %bb.0: -; X64-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x61,0xd9] -; X64-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x61,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; X64-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_punpckhw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhw_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x69,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_punpckhw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhw_d_256: ; X86: # %bb.0: -; X86-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x69,0xd9] -; X86-NEXT: # ymm3 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x69,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; X86-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhw_d_256: ; X64: # %bb.0: -; X64-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x69,0xd9] -; X64-NEXT: # ymm3 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x69,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; X64-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res } define <8 x i16> @test_mask_add_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { @@ -2146,70 +2425,92 @@ define <16 x i8>@test_int_x86_avx512_mask_pmaxs_b_128(<16 x i8> %x0, <16 x i8> % ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3c,0xd1] -; X86-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3c,0xc1] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_b_128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3c,0xd1] -; X64-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3c,0xc1] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2 ,i16 %mask) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) - %res2 = add <16 x i8> %res, %res1 - ret <16 x i8> %res2 + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pmaxs_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmaxs_b_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmaxs_b_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_pmaxs_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxs_b_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_pmaxs_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxs_b_256: ; X86: # %bb.0: -; X86-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3c,0xd1] -; X86-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_b_256: ; X64: # %bb.0: -; X64-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3c,0xd1] -; X64-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res2 = add <32 x i8> %res, %res1 - ret <32 x i8> %res2 + ret <32 x i8> %res } declare <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pmaxs_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxs_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xee,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pmaxs_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxs_w_128: ; X86: # %bb.0: -; X86-NEXT: vpmaxsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xee,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmaxsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xee,0xd1] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_w_128: ; X64: # %bb.0: -; X64-NEXT: vpmaxsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xee,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xee,0xd1] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) @@ -2219,21 +2520,33 @@ define <16 x i16>@test_int_x86_avx512_mask_pmaxs_w_256(<16 x i16> %x0, <16 x i16 ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xee,0xd1] -; X86-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xee,0xc1] -; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_w_256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xee,0xd1] -; X64-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xee,0xc1] -; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_pmaxs_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmaxs_w_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xee,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmaxs_w_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xee,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res } declare <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) @@ -2243,70 +2556,92 @@ define <16 x i8>@test_int_x86_avx512_mask_pmaxu_b_128(<16 x i8> %x0, <16 x i8> % ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxub %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xde,0xd1] -; X86-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xde,0xc1] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_b_128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxub %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xde,0xd1] -; X64-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xde,0xc1] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) - %res2 = add <16 x i8> %res, %res1 - ret <16 x i8> %res2 + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pmaxu_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmaxu_b_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xde,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmaxu_b_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xde,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_pmaxu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxu_b_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_pmaxu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxu_b_256: ; X86: # %bb.0: -; X86-NEXT: vpmaxub %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxub %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xde,0xd1] -; X86-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_b_256: ; X64: # %bb.0: -; X64-NEXT: vpmaxub %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxub %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xde,0xd1] -; X64-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res2 = add <32 x i8> %res, %res1 - ret <32 x i8> %res2 + ret <32 x i8> %res } declare <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pmaxu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxu_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pmaxu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxu_w_128: ; X86: # %bb.0: -; X86-NEXT: vpmaxuw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmaxuw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3e,0xd1] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_w_128: ; X64: # %bb.0: -; X64-NEXT: vpmaxuw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxuw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3e,0xd1] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) @@ -2316,21 +2651,33 @@ define <16 x i16>@test_int_x86_avx512_mask_pmaxu_w_256(<16 x i16> %x0, <16 x i16 ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxuw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3e,0xd1] -; X86-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x3e,0xc1] -; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_w_256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxuw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3e,0xd1] -; X64-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x3e,0xc1] -; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_pmaxu_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmaxu_w_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x3e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmaxu_w_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x3e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res } declare <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) @@ -2340,70 +2687,92 @@ define <16 x i8>@test_int_x86_avx512_mask_pmins_b_128(<16 x i8> %x0, <16 x i8> % ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x38,0xd1] -; X86-NEXT: vpminsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x38,0xc1] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_b_128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x38,0xd1] -; X64-NEXT: vpminsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x38,0xc1] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) - %res2 = add <16 x i8> %res, %res1 - ret <16 x i8> %res2 + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pmins_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmins_b_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpminsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x38,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmins_b_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpminsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x38,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_pmins_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmins_b_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_pmins_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmins_b_256: ; X86: # %bb.0: -; X86-NEXT: vpminsb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x38,0xd1] -; X86-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_b_256: ; X64: # %bb.0: -; X64-NEXT: vpminsb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x38,0xd1] -; X64-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res2 = add <32 x i8> %res, %res1 - ret <32 x i8> %res2 + ret <32 x i8> %res } declare <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pmins_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmins_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xea,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pmins_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmins_w_128: ; X86: # %bb.0: -; X86-NEXT: vpminsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xea,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpminsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xea,0xd1] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_w_128: ; X64: # %bb.0: -; X64-NEXT: vpminsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xea,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xea,0xd1] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) @@ -2413,21 +2782,33 @@ define <16 x i16>@test_int_x86_avx512_mask_pmins_w_256(<16 x i16> %x0, <16 x i16 ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xea,0xd1] -; X86-NEXT: vpminsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xea,0xc1] -; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_w_256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xea,0xd1] -; X64-NEXT: vpminsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xea,0xc1] -; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_pmins_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmins_w_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpminsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xea,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmins_w_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpminsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xea,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res } declare <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) @@ -2437,70 +2818,92 @@ define <16 x i8>@test_int_x86_avx512_mask_pminu_b_128(<16 x i8> %x0, <16 x i8> % ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminub %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xda,0xd1] -; X86-NEXT: vpminub %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xda,0xc1] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_b_128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminub %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xda,0xd1] -; X64-NEXT: vpminub %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xda,0xc1] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) - %res2 = add <16 x i8> %res, %res1 - ret <16 x i8> %res2 + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pminu_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pminu_b_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpminub %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xda,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pminu_b_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpminub %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xda,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_pminu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pminu_b_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_pminu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pminu_b_256: ; X86: # %bb.0: -; X86-NEXT: vpminub %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminub %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xda,0xd1] -; X86-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_b_256: ; X64: # %bb.0: -; X64-NEXT: vpminub %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminub %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xda,0xd1] -; X64-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res2 = add <32 x i8> %res, %res1 - ret <32 x i8> %res2 + ret <32 x i8> %res } declare <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pminu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pminu_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3a,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pminu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pminu_w_128: ; X86: # %bb.0: -; X86-NEXT: vpminuw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3a,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpminuw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3a,0xd1] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_w_128: ; X64: # %bb.0: -; X64-NEXT: vpminuw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3a,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminuw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3a,0xd1] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) @@ -2510,21 +2913,33 @@ define <16 x i16>@test_int_x86_avx512_mask_pminu_w_256(<16 x i16> %x0, <16 x i16 ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminuw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3a,0xd1] -; X86-NEXT: vpminuw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x3a,0xc1] -; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_w_256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminuw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3a,0xd1] -; X64-NEXT: vpminuw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x3a,0xc1] -; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_pminu_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pminu_w_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpminuw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x3a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pminu_w_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpminuw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x3a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res } declare <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) @@ -2895,249 +3310,349 @@ define <16 x i16>@test_int_x86_avx512_mask_psll_wi_256(<16 x i16> %x0, i32 %x1, declare <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_pshuf_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshuf_b_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_pshuf_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshuf_b_128: ; X86: # %bb.0: -; X86-NEXT: vpshufb %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpshufb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x00,0xd1] -; X86-NEXT: vpaddb %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshuf_b_128: ; X64: # %bb.0: -; X64-NEXT: vpshufb %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshufb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x00,0xd1] -; X64-NEXT: vpaddb %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) - %res2 = add <16 x i8> %res, %res1 - ret <16 x i8> %res2 + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_pshuf_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshuf_b_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_pshuf_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshuf_b_256: ; X86: # %bb.0: -; X86-NEXT: vpshufb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpshufb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x00,0xd1] -; X86-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshuf_b_256: ; X64: # %bb.0: -; X64-NEXT: vpshufb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshufb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x00,0xd1] -; X64-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res2 = add <32 x i8> %res, %res1 - ret <32 x i8> %res2 + ret <32 x i8> %res } declare <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pmovzxb_w_128(<16 x i8> %x0, <8 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxb_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbw %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x30,0xc0] +; CHECK-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pmovzxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_w_128: ; X86: # %bb.0: -; X86-NEXT: vpmovzxbw %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x30,0xd0] -; X86-NEXT: # xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmovzxbw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x30,0xc8] ; X86-NEXT: # xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; X86-NEXT: vpmovzxbw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x30,0xc0] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; X86-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_w_128: ; X64: # %bb.0: -; X64-NEXT: vpmovzxbw %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x30,0xd0] -; X64-NEXT: # xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovzxbw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x30,0xc8] ; X64-NEXT: # xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) + ret <8 x i16> %res +} + +define <8 x i16>@test_int_x86_avx512_maskz_pmovzxb_w_128(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxb_w_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpmovzxbw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x30,0xc0] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxb_w_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovzxbw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x30,0xc0] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; X64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2) - %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 -1) - %res3 = add <8 x i16> %res, %res1 - %res4 = add <8 x i16> %res3, %res2 - ret <8 x i16> %res4 + %res = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2) + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_pmovzxb_w_256(<16 x i8> %x0, <16 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxb_w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x30,0xc0] +; CHECK-NEXT: # ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_pmovzxb_w_256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_w_256: ; X86: # %bb.0: -; X86-NEXT: vpmovzxbw %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x30,0xd0] -; X86-NEXT: # ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovzxbw %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x30,0xc8] ; X86-NEXT: # ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; X86-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x30,0xc0] -; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; X86-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_w_256: ; X64: # %bb.0: -; X64-NEXT: vpmovzxbw %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x30,0xd0] -; X64-NEXT: # ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovzxbw %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x30,0xc8] ; X64-NEXT: # ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; X64-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x30,0xc0] -; X64-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; X64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2) - %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 -1) - %res3 = add <16 x i16> %res, %res1 - %res4 = add <16 x i16> %res3, %res2 - ret <16 x i16> %res4 + ret <16 x i16> %res } +define <16 x i16>@test_int_x86_avx512_maskz_pmovzxb_w_256(<16 x i8> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxb_w_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x30,0xc0] +; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxb_w_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x30,0xc0] +; X64-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2) + ret <16 x i16> %res +} declare <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pmovsxb_w_128(<16 x i8> %x0, <8 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxb_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x20,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pmovsxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_w_128: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbw %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x20,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmovsxbw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x20,0xc8] -; X86-NEXT: vpmovsxbw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x20,0xc0] -; X86-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_w_128: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbw %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x20,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovsxbw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x20,0xc8] -; X64-NEXT: vpmovsxbw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x20,0xc0] -; X64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2) - %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 -1) - %res3 = add <8 x i16> %res, %res1 - %res4 = add <8 x i16> %res3, %res2 - ret <8 x i16> %res4 + ret <8 x i16> %res +} + +define <8 x i16>@test_int_x86_avx512_maskz_pmovsxb_w_128(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxb_w_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpmovsxbw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x20,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxb_w_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmovsxbw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x20,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2) + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_pmovsxb_w_256(<16 x i8> %x0, <16 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxb_w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_pmovsxb_w_256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_w_256: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbw %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0xd0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovsxbw %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x20,0xc8] -; X86-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x20,0xc0] -; X86-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_w_256: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbw %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovsxbw %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x20,0xc8] -; X64-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x20,0xc0] -; X64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2) - %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 -1) - %res3 = add <16 x i16> %res, %res1 - %res4 = add <16 x i16> %res3, %res2 - ret <16 x i16> %res4 + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_pmovsxb_w_256(<16 x i8> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxb_w_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x20,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxb_w_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x20,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2) + ret <16 x i16> %res } declare <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_pmovsxd_q_128(<4 x i32> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxd_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x25,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_pmovsxd_q_128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxd_q_128: ; X86: # %bb.0: -; X86-NEXT: vpmovsxdq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x25,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmovsxdq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x25,0xc8] -; X86-NEXT: vpmovsxdq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x25,0xc0] -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxd_q_128: ; X64: # %bb.0: -; X64-NEXT: vpmovsxdq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x25,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovsxdq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x25,0xc8] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_maskz_pmovsxd_q_128(<4 x i32> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxd_q_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpmovsxdq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x25,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxd_q_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovsxdq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x25,0xc0] -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2) - %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 -1) - %res3 = add <2 x i64> %res, %res1 - %res4 = add <2 x i64> %res3, %res2 - ret <2 x i64> %res4 + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2) + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_pmovsxd_q_256(<4 x i32> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxd_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x25,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_pmovsxd_q_256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxd_q_256: ; X86: # %bb.0: -; X86-NEXT: vpmovsxdq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x25,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmovsxdq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x25,0xc8] -; X86-NEXT: vpmovsxdq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x25,0xc0] -; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxd_q_256: ; X64: # %bb.0: -; X64-NEXT: vpmovsxdq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x25,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovsxdq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x25,0xc8] -; X64-NEXT: vpmovsxdq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x25,0xc0] -; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 -1) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 + ret <4 x i64> %res } +define <4 x i64>@test_int_x86_avx512_maskz_pmovsxd_q_256(<4 x i32> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxd_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpmovsxdq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x25,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxd_q_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmovsxdq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x25,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2) + ret <4 x i64> %res +} declare <16 x i8> @llvm.x86.avx512.cvtmask2b.128(i16) @@ -5805,336 +6320,420 @@ define i16@test_int_x86_avx512_cvtw2mask_256(<16 x i16> %x0) { declare <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_pmulhu_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe4,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmulhu_w_128: ; X86: # %bb.0: -; X86-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe4,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe4,0xd1] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmulhu_w_128: ; X64: # %bb.0: -; X64-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe4,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe4,0xd1] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_pmulhu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmulhu_w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_pmulhu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmulhu_w_256: ; X86: # %bb.0: -; X86-NEXT: vpmulhuw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe4,0xd1] -; X86-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmulhu_w_256: ; X64: # %bb.0: -; X64-NEXT: vpmulhuw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe4,0xd1] -; X64-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res } declare <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pmulh_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmulh_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe5,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pmulh_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmulh_w_128: ; X86: # %bb.0: -; X86-NEXT: vpmulhw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe5,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmulhw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe5,0xd1] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmulh_w_128: ; X64: # %bb.0: -; X64-NEXT: vpmulhw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe5,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmulhw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe5,0xd1] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_pmulh_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmulh_w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_pmulh_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmulh_w_256: ; X86: # %bb.0: -; X86-NEXT: vpmulhw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmulhw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe5,0xd1] -; X86-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmulh_w_256: ; X64: # %bb.0: -; X64-NEXT: vpmulhw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmulhw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe5,0xd1] -; X64-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res } declare <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pmulhr_sw_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmulhr_sw_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pmulhr_sw_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmulhr_sw_128: ; X86: # %bb.0: -; X86-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0b,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x0b,0xd1] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmulhr_sw_128: ; X64: # %bb.0: -; X64-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0b,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x0b,0xd1] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_pmulhr_sw_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmulhr_sw_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmulhr_sw_256: ; X86: # %bb.0: -; X86-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x0b,0xd1] -; X86-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmulhr_sw_256: ; X64: # %bb.0: -; X64-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x0b,0xd1] -; X64-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res } declare <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_ask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_ask_pmaddubs_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x04,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128: ; X86: # %bb.0: -; X86-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x04,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x04,0xd1] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128: ; X64: # %bb.0: -; X64-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x04,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x04,0xd1] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_pmaddubs_w_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaddubs_w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_pmaddubs_w_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaddubs_w_256: ; X86: # %bb.0: -; X86-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x04,0xd1] -; X86-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaddubs_w_256: ; X64: # %bb.0: -; X64-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x04,0xd1] -; X64-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res } declare <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaddw_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf5,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaddw_d_128: ; X86: # %bb.0: -; X86-NEXT: vpmaddwd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf5,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmaddwd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xf5,0xd1] -; X86-NEXT: vpaddd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaddw_d_128: ; X64: # %bb.0: -; X64-NEXT: vpmaddwd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf5,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaddwd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xf5,0xd1] -; X64-NEXT: vpaddd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaddw_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaddw_d_256: ; X86: # %bb.0: -; X86-NEXT: vpmaddwd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmaddwd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xf5,0xd1] -; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaddw_d_256: ; X64: # %bb.0: -; X64-NEXT: vpmaddwd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaddwd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xf5,0xd1] -; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_hi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_hi_128: ; X86: # %bb.0: -; X86-NEXT: vpermw %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpermw %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x8d,0xd0] -; X86-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0] -; X86-NEXT: vpaddw %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_hi_128: ; X64: # %bb.0: -; X64-NEXT: vpermw %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermw %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x8d,0xd0] -; X64-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0] -; X64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3) - %res2 = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res3 = add <8 x i16> %res, %res1 - %res4 = add <8 x i16> %res3, %res2 - ret <8 x i16> %res4 + ret <8 x i16> %res +} + +define <8 x i16>@test_int_x86_avx512_maskz_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_hi_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_hi_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3) + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_hi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_hi_256: ; X86: # %bb.0: -; X86-NEXT: vpermw %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xd8] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermw %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x8d,0xd0] -; X86-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0] -; X86-NEXT: vpaddw %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3] -; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_hi_256: ; X64: # %bb.0: -; X64-NEXT: vpermw %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermw %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x8d,0xd0] -; X64-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0] -; X64-NEXT: vpaddw %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3] -; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3) - %res2 = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res3 = add <16 x i16> %res, %res1 - %res4 = add <16 x i16> %res3, %res2 - ret <16 x i16> %res4 + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_hi_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_hi_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3) + ret <16 x i16> %res } declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_hi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x7d,0xca] -; X86-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x7d,0xca] -; X64-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) @@ -6142,52 +6741,47 @@ declare <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16>, <8 x i16>, define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7d,0xca] -; X86-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X86-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x75,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7d,0xca] -; X64-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X64-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x75,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_hi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7d,0xca] -; X86-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7d,0xca] -; X64-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res } declare <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) @@ -6195,78 +6789,76 @@ declare <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16>, <16 x i1 define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7d,0xca] -; X86-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X86-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x75,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7d,0xca] -; X64-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X64-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x75,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res } declare <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_hi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2w %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X86-NEXT: vpermt2w %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7d,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpermi2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x75,0xca] -; X86-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X64-NEXT: vpermt2w %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x75,0xca] -; X64-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_hi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2w %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X86-NEXT: vpermt2w %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x75,0xca] -; X86-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X64-NEXT: vpermt2w %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x75,0xca] -; X64-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res } declare <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8>, <16 x i8>, i32, <8 x i16>, i8) @@ -8795,32 +9387,50 @@ define <8 x i16>@test_int_x86_avx512_mask_psrlv8_hi(<8 x i16> %x0, <8 x i16> %x1 declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmov_wb_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovwb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_256: ; X86: # %bb.0: -; X86-NEXT: vpmovwb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc2] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovwb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1] -; X86-NEXT: vpmovwb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_256: ; X64: # %bb.0: -; X64-NEXT: vpmovwb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc2] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovwb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pmov_wb_256(<16 x i16> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmov_wb_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovwb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmov_wb_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovwb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] - %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) - %res3 = add <16 x i8> %res0, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 + %res = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) + ret <16 x i8> %res } diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll index bec461abed8b2b..73008582aca28e 100644 --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -1127,167 +1127,163 @@ define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %pt declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) +define <8 x i16>@test_int_x86_avx512_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_hi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2) + ret <8 x i16> %1 +} + define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x7d,0xca] -; X86-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x7d,0xca] -; X64-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x1 - %4 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2) - %res2 = add <8 x i16> %3, %4 - ret <8 x i16> %res2 + ret <8 x i16> %3 } define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7d,0xca] -; X86-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X86-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x75,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7d,0xca] -; X64-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X64-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x75,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer - %4 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2) - %res2 = add <8 x i16> %3, %4 - ret <8 x i16> %res2 + ret <8 x i16> %3 +} + +define <16 x i16>@test_int_x86_avx512_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_hi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2) + ret <16 x i16> %1 } define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7d,0xca] -; X86-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7d,0xca] -; X64-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x1 - %4 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2) - %res2 = add <16 x i16> %3, %4 - ret <16 x i16> %res2 + ret <16 x i16> %3 } define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7d,0xca] -; X86-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X86-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x75,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7d,0xca] -; X64-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X64-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x75,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer - %4 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2) - %res2 = add <16 x i16> %3, %4 - ret <16 x i16> %res2 + ret <16 x i16> %3 } declare <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>) +define <8 x i16>@test_int_x86_avx512_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_hi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2w %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) + ret <8 x i16> %1 +} + define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X86-NEXT: vpermt2w %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7d,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpermi2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x75,0xca] -; X86-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X64-NEXT: vpermt2w %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x75,0xca] -; X64-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x1 - %4 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) - %res2 = add <8 x i16> %3, %4 - ret <8 x i16> %res2 + ret <8 x i16> %3 } declare <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>) +define <16 x i16>@test_int_x86_avx512_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_hi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2w %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) + ret <16 x i16> %1 +} + define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X86-NEXT: vpermt2w %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x75,0xca] -; X86-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X64-NEXT: vpermt2w %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x75,0xca] -; X64-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x1 - %4 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) - %res2 = add <16 x i16> %3, %4 - ret <16 x i16> %res2 + ret <16 x i16> %3 } declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) @@ -1716,38 +1712,56 @@ define void @test_int_x86_avx512_mask_pmovus_wb_mem_128(i8* %ptr, <8 x i16> %x1, ret void } +define <16 x i8>@test_int_x86_avx512_pmov_wb_256(<16 x i16> %x0) { +; CHECK-LABEL: test_int_x86_avx512_pmov_wb_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovwb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = trunc <16 x i16> %x0 to <16 x i8> + ret <16 x i8> %1 +} + define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_256: ; X86: # %bb.0: -; X86-NEXT: vpmovwb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc2] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovwb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1] -; X86-NEXT: vpmovwb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_256: ; X64: # %bb.0: -; X64-NEXT: vpmovwb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc2] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovwb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %1 = trunc <16 x i16> %x0 to <16 x i8> + %2 = bitcast i16 %x2 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %x1 + ret <16 x i8> %3 +} + +define <16 x i8>@test_int_x86_avx512_maskz_pmov_wb_256(<16 x i16> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmov_wb_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovwb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmov_wb_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovwb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %1 = trunc <16 x i16> %x0 to <16 x i8> - %2 = trunc <16 x i16> %x0 to <16 x i8> - %3 = bitcast i16 %x2 to <16 x i1> - %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %x1 - %5 = trunc <16 x i16> %x0 to <16 x i8> - %6 = bitcast i16 %x2 to <16 x i1> - %7 = select <16 x i1> %6, <16 x i8> %5, <16 x i8> zeroinitializer - %res3 = add <16 x i8> %1, %4 - %res4 = add <16 x i8> %res3, %7 - ret <16 x i8> %res4 + %2 = bitcast i16 %x2 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 } declare void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16>, i16) @@ -1776,34 +1790,52 @@ define void @test_int_x86_avx512_mask_pmov_wb_mem_256(i8* %ptr, <16 x i16> %x1, declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_pmovs_wb_256(<16 x i16> %x0, <16 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovs_wb_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovswb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x20,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovs_wb_256: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovswb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x20,0xc1] -; X86-NEXT: vpmovswb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x20,0xc2] -; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca] -; X86-NEXT: vpmovswb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x20,0xc0] -; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovs_wb_256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpmovswb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x20,0xc2] ; X64-NEXT: vpmovswb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x20,0xc1] -; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca] -; X64-NEXT: vpmovswb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x20,0xc0] -; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] - %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) - %res3 = add <16 x i8> %res0, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 + %res = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pmovs_wb_256(<16 x i16> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovs_wb_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovswb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x20,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovs_wb_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmovswb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x20,0xc0] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) + ret <16 x i8> %res } declare void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16>, i16) @@ -1832,34 +1864,52 @@ define void @test_int_x86_avx512_mask_pmovs_wb_mem_256(i8* %ptr, <16 x i16> %x1, declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_pmovus_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmovus_wb_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovuswb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x10,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovus_wb_256: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovuswb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x10,0xc1] -; X86-NEXT: vpmovuswb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x10,0xc2] -; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca] -; X86-NEXT: vpmovuswb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x10,0xc0] -; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovus_wb_256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpmovuswb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x10,0xc2] ; X64-NEXT: vpmovuswb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x10,0xc1] -; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca] -; X64-NEXT: vpmovuswb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x10,0xc0] -; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] - %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) - %res3 = add <16 x i8> %res0, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 + %res = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pmovus_wb_256(<16 x i16> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovus_wb_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovuswb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x10,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovus_wb_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmovuswb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x10,0xc0] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) + ret <16 x i8> %res } declare void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16>, i16) @@ -2360,70 +2410,100 @@ declare <16 x i16> @llvm.x86.avx512.psllv.w.256(<16 x i16>, <16 x i16>) declare <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16>, <8 x i16>) +define <8 x i16>@test_int_x86_avx512_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_permvar_hi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1) + ret <8 x i16> %1 +} + define <8 x i16>@test_int_x86_avx512_mask_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_hi_128: ; X86: # %bb.0: -; X86-NEXT: vpermw %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpermw %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x8d,0xd0] -; X86-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0] -; X86-NEXT: vpaddw %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_hi_128: ; X64: # %bb.0: -; X64-NEXT: vpermw %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermw %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x8d,0xd0] -; X64-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0] -; X64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x2 - %4 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = select <8 x i1> %5, <8 x i16> %4, <8 x i16> zeroinitializer - %7 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1) - %res3 = add <8 x i16> %3, %6 - %res4 = add <8 x i16> %res3, %7 - ret <8 x i16> %res4 + ret <8 x i16> %3 +} + +define <8 x i16>@test_int_x86_avx512_maskz_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_hi_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_hi_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + ret <8 x i16> %3 } declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>) +define <16 x i16>@test_int_x86_avx512_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_permvar_hi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1) + ret <16 x i16> %1 +} + define <16 x i16>@test_int_x86_avx512_mask_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_hi_256: ; X86: # %bb.0: -; X86-NEXT: vpermw %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xd8] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermw %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x8d,0xd0] -; X86-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0] -; X86-NEXT: vpaddw %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3] -; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_hi_256: ; X64: # %bb.0: -; X64-NEXT: vpermw %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermw %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x8d,0xd0] -; X64-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0] -; X64-NEXT: vpaddw %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3] -; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x2 - %4 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1) - %5 = bitcast i16 %x3 to <16 x i1> - %6 = select <16 x i1> %5, <16 x i16> %4, <16 x i16> zeroinitializer - %7 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1) - %res3 = add <16 x i16> %3, %6 - %res4 = add <16 x i16> %res3, %7 - ret <16 x i16> %res4 + ret <16 x i16> %3 } +define <16 x i16>@test_int_x86_avx512_maskz_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_hi_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_hi_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll index ab0cf02b4db947..0029d6a60881b0 100644 --- a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll @@ -4,184 +4,277 @@ declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double>, i32, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vextractf64x2_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vextractf64x2_512: ; X86: # %bb.0: -; X86-NEXT: vextractf128 $1, %ymm0, %xmm2 # encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x19,0xc1,0x01] -; X86-NEXT: vaddpd %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x58,0xca] -; X86-NEXT: vextractf64x2 $1, %zmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x19,0xc0,0x01] -; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x58,0xc1] +; X86-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vextractf64x2_512: ; X64: # %bb.0: -; X64-NEXT: vextractf128 $1, %ymm0, %xmm2 # encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x19,0xc1,0x01] -; X64-NEXT: vaddpd %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x58,0xca] -; X64-NEXT: vextractf64x2 $1, %zmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x19,0xc0,0x01] -; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x58,0xc1] +; X64-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) - %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) - %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res2, %res3 - ret <2 x double> %res4 + ret <2 x double> %res +} + +define <2 x double>@test_int_x86_avx512_maskz_vextractf64x2_512(<8 x double> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vextractf64x2_512: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vextractf64x2 $1, %zmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x19,0xc0,0x01] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vextractf64x2_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vextractf64x2 $1, %zmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x19,0xc0,0x01] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) + ret <2 x double> %res } declare <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float>, i32, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_vextractf32x8(<16 x float> %x0, <8 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vextractf32x8: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1b,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vextractf32x8: ; X86: # %bb.0: -; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm2 # encoding: [0x62,0xf3,0xfd,0x48,0x1b,0xc2,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x1b,0xc1,0x01] -; X86-NEXT: vaddps %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf4,0x58,0xca] -; X86-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1b,0xc0,0x01] -; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x58,0xc1] +; X86-NEXT: vmovaps %ymm1, %ymm0 # encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vextractf32x8: ; X64: # %bb.0: -; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm2 # encoding: [0x62,0xf3,0xfd,0x48,0x1b,0xc2,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x1b,0xc1,0x01] -; X64-NEXT: vaddps %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf4,0x58,0xca] -; X64-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1b,0xc0,0x01] -; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x58,0xc1] +; X64-NEXT: vmovaps %ymm1, %ymm0 # encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3) - %res2 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 %x3) - %res1 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 -1) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res2, %res3 - ret <8 x float> %res4 + ret <8 x float> %res +} + +define <8 x float>@test_int_x86_avx512_maskz_vextractf32x8(<16 x float> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vextractf32x8: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1b,0xc0,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vextractf32x8: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1b,0xc0,0x01] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 %x3) + ret <8 x float> %res } declare <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float>, <8 x float>, i32, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_insertf32x8_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x float> %x3, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_insertf32x8_512: ; X86: # %bb.0: -; X86-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd9,0x01] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x1a,0xd1,0x01] -; X86-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1a,0xc1,0x01] -; X86-NEXT: vaddps %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0] -; X86-NEXT: vaddps %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x64,0x48,0x58,0xc0] +; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_insertf32x8_512: ; X64: # %bb.0: -; X64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x1a,0xd1,0x01] -; X64-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1a,0xc1,0x01] -; X64-NEXT: vaddps %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0] -; X64-NEXT: vaddps %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x64,0x48,0x58,0xc0] +; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) - %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 -1) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res2, %res3 - ret <16 x float> %res4 + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_maskz_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, i16 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_insertf32x8_512: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1a,0xc1,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_insertf32x8_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1a,0xc1,0x01] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) + ret <16 x float> %res } declare <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double>, <2 x double>, i32, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_insertf64x2_512(<8 x double> %x0, <2 x double> %x1,<8 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_insertf64x2_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x18,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_insertf64x2_512(<8 x double> %x0, <2 x double> %x1,<8 x double> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_insertf64x2_512: ; X86: # %bb.0: -; X86-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x18,0xd9,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x18,0xd1,0x01] -; X86-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x18,0xc1,0x01] -; X86-NEXT: vaddpd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] -; X86-NEXT: vaddpd %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc3] +; X86-NEXT: vmovapd %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_insertf64x2_512: ; X64: # %bb.0: -; X64-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x18,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x18,0xd1,0x01] -; X64-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x18,0xc1,0x01] -; X64-NEXT: vaddpd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] -; X64-NEXT: vaddpd %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc3] +; X64-NEXT: vmovapd %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) - %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) - %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 -1) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res3, %res2 - ret <8 x double> %res4 + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_insertf64x2_512(<8 x double> %x0, <2 x double> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_insertf64x2_512: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x18,0xc1,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_insertf64x2_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x18,0xc1,0x01] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) + ret <8 x double> %res } declare <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32>, <8 x i32>, i32, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3) { +; CHECK-LABEL: test_int_x86_avx512_inserti32x8_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_inserti32x8_512: ; X86: # %bb.0: -; X86-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd9,0x01] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3a,0xd1,0x01] -; X86-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x3a,0xc1,0x01] -; X86-NEXT: vpaddd %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3] -; X86-NEXT: vpaddd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_inserti32x8_512: ; X64: # %bb.0: -; X64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3a,0xd1,0x01] -; X64-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x3a,0xc1,0x01] -; X64-NEXT: vpaddd %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3] -; X64-NEXT: vpaddd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res3, %res2 - ret <16 x i32> %res4 + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, i16 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_inserti32x8_512: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x3a,0xc1,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_inserti32x8_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x3a,0xc1,0x01] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64>, <2 x i64>, i32, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, <8 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_inserti64x2_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x18,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, <8 x i64> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_inserti64x2_512: ; X86: # %bb.0: -; X86-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x38,0xd9,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x38,0xd1,0x01] -; X86-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x38,0xc1,0x01] -; X86-NEXT: vpaddq %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc3] -; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_inserti64x2_512: ; X64: # %bb.0: -; X64-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x38,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x38,0xd1,0x01] -; X64-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x38,0xc1,0x01] -; X64-NEXT: vpaddq %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc3] -; X64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res2, %res3 - ret <8 x i64> %res4 + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_inserti64x2_512: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x38,0xc1,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_inserti64x2_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x38,0xc1,0x01] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) + ret <8 x i64> %res } @@ -223,35 +316,55 @@ define <8 x i64>@test_int_x86_avx512_cvtmask2q_512(i8 %x0) { declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float>, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_broadcastf32x8_512(<8 x float> %x0, <16 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcastf32x8_512: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + + %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512(<8 x float> %x0, <16 x float> %x2, i16 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512: ; X86: # %bb.0: ; X86-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd0,0x01] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x1a,0xc8,0x01] -; X86-NEXT: vaddps %zmm1, %zmm2, %zmm1 # encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9] -; X86-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1a,0xc0,0x01] -; X86-NEXT: vaddps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1] +; X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512: ; X64: # %bb.0: ; X64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x1a,0xc8,0x01] -; X64-NEXT: vaddps %zmm1, %zmm2, %zmm1 # encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9] +; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + + %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 %mask) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_maskz_broadcastf32x8_512(<8 x float> %x0, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcastf32x8_512: +; X86: # %bb.0: +; X86-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1a,0xc0,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_broadcastf32x8_512: +; X64: # %bb.0: +; X64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1a,0xc0,0x01] -; X64-NEXT: vaddps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 -1) - %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 %mask) - %res3 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> zeroinitializer, i16 %mask) - %res4 = fadd <16 x float> %res1, %res2 - %res5 = fadd <16 x float> %res3, %res4 - ret <16 x float> %res5 + %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> zeroinitializer, i16 %mask) + ret <16 x float> %res } define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512_load(<8 x float>* %x0ptr, <16 x float> %x2, i16 %mask) { @@ -277,37 +390,60 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512_load(<8 x float> declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double>, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_broadcastf64x2_512(<2 x double> %x0, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcastf64x2_512: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512(<2 x double> %x0, <8 x double> %x2, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_broadcastf64x2_512: ; X86: # %bb.0: ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] -; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd0,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xc8,0x01] -; X86-NEXT: vaddpd %zmm1, %zmm2, %zmm1 # encoding: [0x62,0xf1,0xed,0x48,0x58,0xc9] -; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc0,0x01] -; X86-NEXT: vaddpd %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1] +; X86-NEXT: vmovapd %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_broadcastf64x2_512: ; X64: # %bb.0: ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] -; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xc8,0x01] -; X64-NEXT: vaddpd %zmm1, %zmm2, %zmm1 # encoding: [0x62,0xf1,0xed,0x48,0x58,0xc9] +; X64-NEXT: vmovapd %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 %mask) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_broadcastf64x2_512(<2 x double> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcastf64x2_512: +; X86: # %bb.0: +; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc0,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_broadcastf64x2_512: +; X64: # %bb.0: +; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc0,0x01] -; X64-NEXT: vaddpd %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 -1) - %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 %mask) - %res3 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> zeroinitializer, i8 %mask) - %res4 = fadd <8 x double> %res1, %res2 - %res5 = fadd <8 x double> %res3, %res4 - ret <8 x double> %res5 + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res } define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512_load(<2 x double>* %x0ptr, <8 x double> %x2, i8 %mask) { @@ -333,35 +469,55 @@ define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512_load(<2 x double declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_broadcasti32x8_512(<8 x i32> %x0, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcasti32x8_512: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + + %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x8_512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512: ; X86: # %bb.0: ; X86-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd0,0x01] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3a,0xc8,0x01] -; X86-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x3a,0xc0,0x01] -; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] -; X86-NEXT: vpaddd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512: ; X64: # %bb.0: ; X64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3a,0xc8,0x01] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + + %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_broadcasti32x8_512(<8 x i32> %x0, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcasti32x8_512: +; X86: # %bb.0: +; X86-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x3a,0xc0,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_broadcasti32x8_512: +; X64: # %bb.0: +; X64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x3a,0xc0,0x01] -; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] -; X64-NEXT: vpaddd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 -1) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask) - %res3 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) - %res4 = add <16 x i32> %res1, %res2 - %res5 = add <16 x i32> %res3, %res4 - ret <16 x i32> %res5 + %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res } define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x8_512_load(<8 x i32>* %x0ptr, <16 x i32> %x2, i16 %mask) { @@ -387,37 +543,60 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x8_512_load(<8 x i32>* %x declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_broadcasti64x2_512(<2 x i64> %x0, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcasti64x2_512: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_broadcasti64x2_512: ; X86: # %bb.0: ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; X86-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x38,0xc0,0x01] -; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd0,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3a,0xc8,0x01] -; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc0,0x01] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] -; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_broadcasti64x2_512: ; X64: # %bb.0: ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x38,0xc0,0x01] -; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3a,0xc8,0x01] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_broadcasti64x2_512(<2 x i64> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcasti64x2_512: +; X86: # %bb.0: +; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x38,0xc0,0x01] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc0,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_broadcasti64x2_512: +; X64: # %bb.0: +; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x38,0xc0,0x01] +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc0,0x01] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] -; X64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 -1) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask) - %res3 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask) - %res4 = add <8 x i64> %res1, %res2 - %res5 = add <8 x i64> %res3, %res4 - ret <8 x i64> %res5 + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res } define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512_load(<2 x i64>* %x0ptr, <8 x i64> %x2, i8 %mask) { diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll index c2afbe09e14592..ed83a23a5f4c7a 100644 --- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll @@ -2388,94 +2388,142 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64>, <2 x i64>, <2 x i declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double>, i32, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vextractf64x2_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vextractf64x2_256: ; X86: # %bb.0: -; X86-NEXT: vextractf128 $1, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01] -; X86-NEXT: vaddpd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] -; X86-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01] -; X86-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] +; X86-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vextractf64x2_256: ; X64: # %bb.0: -; X64-NEXT: vextractf128 $1, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01] -; X64-NEXT: vaddpd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] -; X64-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01] -; X64-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] +; X64-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) - %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) - %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res3, %res2 - ret <2 x double> %res4 + ret <2 x double> %res +} + +define <2 x double>@test_int_x86_avx512_maskz_vextractf64x2_256(<4 x double> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vextractf64x2_256: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vextractf64x2_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double>, <2 x double>, i32, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, <4 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_insertf64x2_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, <4 x double> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_insertf64x2_256: ; X86: # %bb.0: -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01] -; X86-NEXT: vaddpd %ymm3, %ymm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xd3] -; X86-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01] -; X86-NEXT: vaddpd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2] +; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_insertf64x2_256: ; X64: # %bb.0: -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01] -; X64-NEXT: vaddpd %ymm3, %ymm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xd3] -; X64-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01] -; X64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2] +; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4) - %res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1) - %res2 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> zeroinitializer, i8 %x4) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res2, %res3 - ret <4 x double> %res4 + ret <4 x double> %res +} + +define <4 x double>@test_int_x86_avx512_maskz_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_insertf64x2_256: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_insertf64x2_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> zeroinitializer, i8 %x4) + ret <4 x double> %res } declare <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64>, <2 x i64>, i32, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_inserti64x2_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_inserti64x2_256: ; X86: # %bb.0: -; X86-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01] -; X86-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01] -; X86-NEXT: vpaddq %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xd4,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_inserti64x2_256: ; X64: # %bb.0: -; X64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01] -; X64-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01] -; X64-NEXT: vpaddq %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xd4,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> zeroinitializer, i8 %x4) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_inserti64x2_256: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_inserti64x2_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> zeroinitializer, i8 %x4) + ret <4 x i64> %res } declare <4 x i32> @llvm.x86.avx512.cvtmask2d.128(i8) @@ -2552,35 +2600,55 @@ define <4 x i64>@test_int_x86_avx512_cvtmask2q_256(i8 %x0) { declare <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_broadcastf64x2_256(<2 x double> %x0, <4 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcastf64x2_256: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + + %res = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256(<2 x double> %x0, <4 x double> %x2, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256: ; X86: # %bb.0: ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd0,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x18,0xc8,0x01] -; X86-NEXT: vaddpd %ymm1, %ymm2, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9] -; X86-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc0,0x01] -; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] +; X86-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256: ; X64: # %bb.0: ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x18,0xc8,0x01] -; X64-NEXT: vaddpd %ymm1, %ymm2, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9] +; X64-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + + %res = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 %mask) + ret <4 x double> %res +} + +define <4 x double>@test_int_x86_avx512_maskz_broadcastf64x2_256(<2 x double> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcastf64x2_256: +; X86: # %bb.0: +; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc0,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_broadcastf64x2_256: +; X64: # %bb.0: +; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc0,0x01] -; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 -1) - %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 %mask) - %res3 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> zeroinitializer, i8 %mask) - %res4 = fadd <4 x double> %res1, %res2 - %res5 = fadd <4 x double> %res3, %res4 - ret <4 x double> %res5 + %res = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> zeroinitializer, i8 %mask) + ret <4 x double> %res } define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256_load(<2 x double>* %x0ptr, <4 x double> %x2, i8 %mask) { @@ -2606,35 +2674,55 @@ define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256_load(<2 x double declare <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_broadcasti64x2_256(<2 x i64> %x0, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcasti64x2_256: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + + %res = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256: ; X86: # %bb.0: ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd0,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x38,0xc8,0x01] -; X86-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc0,0x01] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256: ; X64: # %bb.0: ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x38,0xc8,0x01] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + + %res = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask) + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_broadcasti64x2_256(<2 x i64> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcasti64x2_256: +; X86: # %bb.0: +; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc0,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_broadcasti64x2_256: +; X64: # %bb.0: +; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc0,0x01] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res1 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 -1) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask) - %res3 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> zeroinitializer, i8 %mask) - %res4 = add <4 x i64> %res1, %res2 - %res5 = add <4 x i64> %res3, %res4 - ret <4 x i64> %res5 + %res = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> zeroinitializer, i8 %mask) + ret <4 x i64> %res } define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256_load(<2 x i64>* %x0ptr, <4 x i64> %x2, i8 %mask) { @@ -2714,98 +2802,118 @@ define i8@test_int_x86_avx512_cvtq2mask_256(<4 x i64> %x0) { declare <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_cvt_qq2pd_128(<2 x i64> %x0, <2 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_qq2pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2pd %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfe,0x08,0xe6,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_cvt_qq2pd_128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_128: ; X86: # %bb.0: -; X86-NEXT: vcvtqq2pd %xmm0, %xmm2 # encoding: [0x62,0xf1,0xfe,0x08,0xe6,0xd0] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtqq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x09,0xe6,0xc8] -; X86-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_128: ; X64: # %bb.0: -; X64-NEXT: vcvtqq2pd %xmm0, %xmm2 # encoding: [0x62,0xf1,0xfe,0x08,0xe6,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtqq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x09,0xe6,0xc8] -; X64-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) - %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 -1) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_cvt_qq2pd_256(<4 x i64> %x0, <4 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_qq2pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2pd %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfe,0x28,0xe6,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_cvt_qq2pd_256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_256: ; X86: # %bb.0: -; X86-NEXT: vcvtqq2pd %ymm0, %ymm2 # encoding: [0x62,0xf1,0xfe,0x28,0xe6,0xd0] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtqq2pd %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x29,0xe6,0xc8] -; X86-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_256: ; X64: # %bb.0: -; X64-NEXT: vcvtqq2pd %ymm0, %ymm2 # encoding: [0x62,0xf1,0xfe,0x28,0xe6,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtqq2pd %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x29,0xe6,0xc8] -; X64-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) - %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + ret <4 x double> %res } declare <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_cvt_uqq2pd_128(<2 x i64> %x0, <2 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_uqq2pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2pd %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfe,0x08,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_cvt_uqq2pd_128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_128: ; X86: # %bb.0: -; X86-NEXT: vcvtuqq2pd %xmm0, %xmm2 # encoding: [0x62,0xf1,0xfe,0x08,0x7a,0xd0] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtuqq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x09,0x7a,0xc8] -; X86-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_128: ; X64: # %bb.0: -; X64-NEXT: vcvtuqq2pd %xmm0, %xmm2 # encoding: [0x62,0xf1,0xfe,0x08,0x7a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtuqq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x09,0x7a,0xc8] -; X64-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) - %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 -1) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_cvt_uqq2pd_256(<4 x i64> %x0, <4 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_uqq2pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2pd %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfe,0x28,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_cvt_uqq2pd_256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_256: ; X86: # %bb.0: -; X86-NEXT: vcvtuqq2pd %ymm0, %ymm2 # encoding: [0x62,0xf1,0xfe,0x28,0x7a,0xd0] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtuqq2pd %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x29,0x7a,0xc8] -; X86-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_256: ; X64: # %bb.0: -; X64-NEXT: vcvtuqq2pd %ymm0, %ymm2 # encoding: [0x62,0xf1,0xfe,0x28,0x7a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtuqq2pd %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x29,0x7a,0xc8] -; X64-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) - %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + ret <4 x double> %res } declare i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float>, i32, i8) @@ -2872,52 +2980,64 @@ define i8 @test_int_x86_avx512_mask_fpclass_pd_256(<4 x double> %x0) { declare <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64>, <4 x float>, i8) +define <4 x float> @test_int_x86_avx512_cvt_qq2ps_256(<4 x i64> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_qq2ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x28,0x5b,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 -1) + ret <4 x float> %res +} + define <4 x float> @test_int_x86_avx512_mask_cvt_qq2ps_256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_256: ; X86: # %bb.0: -; X86-NEXT: vcvtqq2ps %ymm0, %xmm2 # encoding: [0x62,0xf1,0xfc,0x28,0x5b,0xd0] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x5b,0xc8] -; X86-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_256: ; X64: # %bb.0: -; X64-NEXT: vcvtqq2ps %ymm0, %xmm2 # encoding: [0x62,0xf1,0xfc,0x28,0x5b,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x5b,0xc8] -; X64-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) - %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res } declare <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64>, <4 x float>, i8) +define <4 x float> @test_int_x86_avx512_cvt_uqq2ps_256(<4 x i64> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_uqq2ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x28,0x7a,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 -1) + ret <4 x float> %res +} + define <4 x float> @test_int_x86_avx512_mask_cvt_uqq2ps_256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_256: ; X86: # %bb.0: -; X86-NEXT: vcvtuqq2ps %ymm0, %xmm2 # encoding: [0x62,0xf1,0xff,0x28,0x7a,0xd0] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x7a,0xc8] -; X86-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_256: ; X64: # %bb.0: -; X64-NEXT: vcvtuqq2ps %ymm0, %xmm2 # encoding: [0x62,0xf1,0xff,0x28,0x7a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x7a,0xc8] -; X64-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) - %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res } diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll index 431d6f9d28f003..169ce93170bf6d 100644 --- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll @@ -4,122 +4,147 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_cvt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2qq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x7b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtpd2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x7b,0xc8] -; X86-NEXT: vcvtpd2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x7b,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x7b,0xc8] -; X64-NEXT: vcvtpd2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x7b,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_cvt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2qq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2qq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x7b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtpd2qq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x7b,0xc8] -; X86-NEXT: vcvtpd2qq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x7b,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2qq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x7b,0xc8] -; X64-NEXT: vcvtpd2qq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x7b,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_cvt_pd2uqq_128(<2 x double> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2uqq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_cvt_pd2uqq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtpd2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x79,0xc8] -; X86-NEXT: vcvtpd2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x79,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x79,0xc8] -; X64-NEXT: vcvtpd2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x79,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_cvt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2uqq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2uqq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_cvt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtpd2uqq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x79,0xc8] -; X86-NEXT: vcvtpd2uqq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x79,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2uqq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x79,0xc8] -; X64-NEXT: vcvtpd2uqq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x79,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_cvt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2qq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_cvt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtps2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7b,0xc8] -; X86-NEXT: vcvtps2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7b,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7b,0xc8] -; X64-NEXT: vcvtps2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7b,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } define <2 x i64> @test_int_x86_avx512_cvt_ps2qq_128_load(<2 x float>* %p) { @@ -287,50 +312,60 @@ define <2 x i64> @test_int_x86_avx512_maskz_cvt_ps2qq_128_load_3(<4 x float>* %p declare <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_cvt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2qq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2qq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x7b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_cvt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtps2qq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x7b,0xc8] -; X86-NEXT: vcvtps2qq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x7b,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2qq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x7b,0xc8] -; X64-NEXT: vcvtps2qq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x7b,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_cvt_ps2uqq_128(<4 x float> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2uqq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_cvt_ps2uqq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtps2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x79,0xc8] -; X86-NEXT: vcvtps2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x79,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x79,0xc8] -; X64-NEXT: vcvtps2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x79,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } define <2 x i64> @test_int_x86_avx512_cvt_ps2uqq_128_load(<2 x float>* %p) { @@ -497,50 +532,70 @@ define <2 x i64> @test_int_x86_avx512_maskz_cvt_ps2uqq_128_load_3(<4 x float>* % declare <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_cvt_ps2uqq_256(<4 x float> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2uqq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2uqq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_cvt_ps2uqq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtps2uqq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x79,0xc8] -; X86-NEXT: vcvtps2uqq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x79,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2uqq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x79,0xc8] -; X64-NEXT: vcvtps2uqq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x79,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_ask_cvt_qq2ps_128(<2 x i64> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_ask_cvt_qq2ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x5b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8] -; X86-NEXT: vcvtqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x5b,0xc0] -; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8] -; X64-NEXT: vcvtqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x5b,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) - %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_cvt_qq2ps_128_zext(<2 x i64> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_qq2ps_128_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x5b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) + %res3 = shufflevector <4 x float> %res2, <4 x float> zeroinitializer, <4 x i32> + ret <4 x float> %res3 } define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_128_zext(<2 x i64> %x0, <4 x float> %x1, i8 %x2) { @@ -548,170 +603,197 @@ define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_128_zext(<2 x i64> %x0, <4 ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8] -; X86-NEXT: vcvtqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x5b,0xc0] -; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_128_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8] -; X64-NEXT: vcvtqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x5b,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) %res1 = shufflevector <4 x float> %res, <4 x float> zeroinitializer, <4 x i32> - %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) - %res3 = shufflevector <4 x float> %res2, <4 x float> zeroinitializer, <4 x i32> - %res4 = fadd <4 x float> %res1, %res3 - ret <4 x float> %res4 + ret <4 x float> %res1 +} + + +define <4 x float>@test_int_x86_avx512_cvt_qq2ps_256(<4 x i64> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_qq2ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x28,0x5b,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %cvt = sitofp <4 x i64> %x0 to <4 x float> + ret <4 x float> %cvt } define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_256: ; X86: # %bb.0: -; X86-NEXT: vcvtqq2ps %ymm0, %xmm2 # encoding: [0x62,0xf1,0xfc,0x28,0x5b,0xd0] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x5b,0xc8] -; X86-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_256: ; X64: # %bb.0: -; X64-NEXT: vcvtqq2ps %ymm0, %xmm2 # encoding: [0x62,0xf1,0xfc,0x28,0x5b,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x5b,0xc8] -; X64-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %cvt1 = sitofp <4 x i64> %x0 to <4 x float> %1 = bitcast i8 %x2 to <8 x i1> %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> %2 = select <4 x i1> %extract, <4 x float> %cvt1, <4 x float> %x1 - %cvt = sitofp <4 x i64> %x0 to <4 x float> - %res2 = fadd <4 x float> %2, %cvt - ret <4 x float> %res2 + ret <4 x float> %2 } declare <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_cvtt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_pd2qq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttpd2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x7a,0xc8] -; X86-NEXT: vcvttpd2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x7a,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x7a,0xc8] -; X64-NEXT: vcvttpd2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x7a,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_cvtt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_pd2qq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2qq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttpd2qq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x7a,0xc8] -; X86-NEXT: vcvttpd2qq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x7a,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2qq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x7a,0xc8] -; X64-NEXT: vcvttpd2qq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x7a,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_cvtt_pd2uqq_128(<2 x double> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_pd2uqq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttpd2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x78,0xc8] -; X86-NEXT: vcvttpd2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x78,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x78,0xc8] -; X64-NEXT: vcvttpd2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x78,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_cvtt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_pd2uqq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2uqq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttpd2uqq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x78,0xc8] -; X86-NEXT: vcvttpd2uqq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x78,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2uqq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x78,0xc8] -; X64-NEXT: vcvttpd2uqq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x78,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_cvtt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ps2qq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttps2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7a,0xc8] -; X86-NEXT: vcvttps2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7a,0xc8] -; X64-NEXT: vcvttps2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } define <2 x i64> @test_int_x86_avx512_cvtt_ps2qq_128_load(<2 x float>* %p) { @@ -879,50 +961,70 @@ define <2 x i64> @test_int_x86_avx512_maskz_cvtt_ps2qq_128_load_3(<4 x float>* % declare <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_cvtt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ps2qq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2qq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttps2qq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x7a,0xc8] -; X86-NEXT: vcvttps2qq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x7a,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2qq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x7a,0xc8] -; X64-NEXT: vcvttps2qq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x7a,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_cvt_uqq2ps_128(<2 x i64> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_uqq2ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8] -; X86-NEXT: vcvtuqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x7a,0xc0] -; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8] -; X64-NEXT: vcvtuqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x7a,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) - %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_cvt_uqq2ps_128_zext(<2 x i64> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_uqq2ps_128_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) + %res3 = shufflevector <4 x float> %res2, <4 x float> zeroinitializer, <4 x i32> + ret <4 x float> %res3 } define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_128_zext(<2 x i64> %x0, <4 x float> %x1, i8 %x2) { @@ -930,76 +1032,82 @@ define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_128_zext(<2 x i64> %x0, < ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8] -; X86-NEXT: vcvtuqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x7a,0xc0] -; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_128_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8] -; X64-NEXT: vcvtuqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x7a,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) %res1 = shufflevector <4 x float> %res, <4 x float> zeroinitializer, <4 x i32> - %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) - %res3 = shufflevector <4 x float> %res2, <4 x float> zeroinitializer, <4 x i32> - %res4 = fadd <4 x float> %res1, %res3 - ret <4 x float> %res4 + ret <4 x float> %res1 } declare <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_cvt_uqq2ps_256(<4 x i64> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvt_uqq2ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x28,0x7a,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %cvt = uitofp <4 x i64> %x0 to <4 x float> + ret <4 x float> %cvt +} + define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_256: ; X86: # %bb.0: -; X86-NEXT: vcvtuqq2ps %ymm0, %xmm2 # encoding: [0x62,0xf1,0xff,0x28,0x7a,0xd0] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x7a,0xc8] -; X86-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_256: ; X64: # %bb.0: -; X64-NEXT: vcvtuqq2ps %ymm0, %xmm2 # encoding: [0x62,0xf1,0xff,0x28,0x7a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x7a,0xc8] -; X64-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] - %cvt1 = uitofp <4 x i64> %x0 to <4 x float> + %cvt = uitofp <4 x i64> %x0 to <4 x float> %1 = bitcast i8 %x2 to <8 x i1> %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> - %2 = select <4 x i1> %extract, <4 x float> %cvt1, <4 x float> %x1 - %cvt = uitofp <4 x i64> %x0 to <4 x float> - %res2 = fadd <4 x float> %2, %cvt - ret <4 x float> %res2 + %2 = select <4 x i1> %extract, <4 x float> %cvt, <4 x float> %x1 + ret <4 x float> %2 } declare <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_cvtt_ps2uqq_128(<4 x float> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ps2uqq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttps2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x78,0xc8] -; X86-NEXT: vcvttps2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x78,0xc8] -; X64-NEXT: vcvttps2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } define <2 x i64> @test_int_x86_avx512_cvtt_ps2uqq_128_load(<2 x float>* %p) { @@ -1167,26 +1275,31 @@ define <2 x i64> @test_int_x86_avx512_maskz_cvtt_ps2uqq_128_load_3(<4 x float>* declare <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_cvtt_ps2uqq_256(<4 x float> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ps2uqq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2uqq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttps2uqq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x78,0xc8] -; X86-NEXT: vcvttps2uqq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x78,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2uqq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x78,0xc8] -; X64-NEXT: vcvttps2uqq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x78,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <2 x double> @llvm.x86.avx512.mask.reduce.pd.128(<2 x double>, i32, <2 x double>, i8) diff --git a/llvm/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll index 8c6b982fe7475a..746e3858333e6c 100644 --- a/llvm/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll @@ -4,130 +4,150 @@ declare <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_permvar_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_qi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_permvar_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_qi_512: ; X86: # %bb.0: -; X86-NEXT: vpermb %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x8d,0xd8] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermb %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x8d,0xd0] -; X86-NEXT: vpermb %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x8d,0xc0] -; X86-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X86-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_qi_512: ; X64: # %bb.0: -; X64-NEXT: vpermb %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x8d,0xd8] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpermb %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x8d,0xd0] -; X64-NEXT: vpermb %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x8d,0xc0] -; X64-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X64-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> zeroinitializer, i64 %x3) - %res2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res3 = add <64 x i8> %res, %res1 - %res4 = add <64 x i8> %res3, %res2 - ret <64 x i8> %res4 + ret <64 x i8> %res +} + +define <64 x i8>@test_int_x86_avx512_maskz_permvar_qi_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_qi_512: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermb %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_qi_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vpermb %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> zeroinitializer, i64 %x3) + ret <64 x i8> %res } declare <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmultishift_qb_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x83,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmultishift_qb_512: ; X86: # %bb.0: -; X86-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x83,0xd9] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x83,0xd1] -; X86-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1] -; X86-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X86-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmultishift_qb_512: ; X64: # %bb.0: -; X64-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x83,0xd9] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x83,0xd1] -; X64-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1] -; X64-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X64-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> zeroinitializer, i64 %x3) - %res2 = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res3 = add <64 x i8> %res, %res1 - %res4 = add <64 x i8> %res3, %res2 - ret <64 x i8> %res4 + ret <64 x i8> %res +} + +define <64 x i8>@test_int_x86_avx512_maskz_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_512: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> zeroinitializer, i64 %x3) + ret <64 x i8> %res } declare <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_vpermi2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_qi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2b %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X86-NEXT: vpermt2b %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x7d,0xda] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xca] -; X86-NEXT: vmovdqa64 %zmm1, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe1] -; X86-NEXT: vpermi2b %zmm2, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xe2] -; X86-NEXT: vpaddb %zmm3, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfc,0xc3] -; X86-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X64-NEXT: vpermt2b %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x7d,0xda] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xca] -; X64-NEXT: vmovdqa64 %zmm1, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe1] -; X64-NEXT: vpermi2b %zmm2, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xe2] -; X64-NEXT: vpaddb %zmm3, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfc,0xc3] -; X64-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %res, <64 x i8> %x2, i64 %x3) - %res2 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res3 = add <64 x i8> %res, %res1 - %res4 = add <64 x i8> %res3, %res2 - ret <64 x i8> %res4 + ret <64 x i8> %res } declare <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_qi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2b %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X86-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x7d,0xda] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7d,0xca] -; X86-NEXT: vpxor %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd9,0xef,0xe4] -; X86-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7d,0xe2] -; X86-NEXT: vpaddb %zmm3, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfc,0xc3] -; X86-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X64-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x7d,0xda] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7d,0xca] -; X64-NEXT: vpxor %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd9,0xef,0xe4] -; X64-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7d,0xe2] -; X64-NEXT: vpaddb %zmm3, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfc,0xc3] -; X64-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> zeroinitializer, <64 x i8> %x2, i64 %x3) - %res2 = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res3 = add <64 x i8> %res, %res1 - %res4 = add <64 x i8> %res3, %res2 - ret <64 x i8> %res4 + ret <64 x i8> %res } declare <64 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) diff --git a/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll index 23a7e2ac3e9659..0df1a1dd75f3fc 100644 --- a/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll @@ -4,142 +4,160 @@ declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) +define <64 x i8>@test_int_x86_avx512_permvar_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_qi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1) + ret <64 x i8> %1 +} + define <64 x i8>@test_int_x86_avx512_mask_permvar_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_qi_512: ; X86: # %bb.0: -; X86-NEXT: vpermb %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x8d,0xd8] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermb %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x8d,0xd0] -; X86-NEXT: vpermb %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x8d,0xc0] -; X86-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X86-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_qi_512: ; X64: # %bb.0: -; X64-NEXT: vpermb %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x8d,0xd8] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpermb %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x8d,0xd0] -; X64-NEXT: vpermb %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x8d,0xc0] -; X64-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X64-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1) %2 = bitcast i64 %x3 to <64 x i1> %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %x2 - %4 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1) - %5 = bitcast i64 %x3 to <64 x i1> - %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer - %7 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1) - %res3 = add <64 x i8> %3, %6 - %res4 = add <64 x i8> %res3, %7 - ret <64 x i8> %res4 + ret <64 x i8> %3 +} + +define <64 x i8>@test_int_x86_avx512_maskz_permvar_qi_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_qi_512: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermb %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_qi_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vpermb %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1) + %2 = bitcast i64 %x3 to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer + ret <64 x i8> %3 } declare <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8>, <64 x i8>) +define <64 x i8>@test_int_x86_avx512_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmultishift_qb_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x83,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1) + ret <64 x i8> %1 +} + define <64 x i8>@test_int_x86_avx512_mask_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmultishift_qb_512: ; X86: # %bb.0: -; X86-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x83,0xd9] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x83,0xd1] -; X86-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1] -; X86-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X86-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmultishift_qb_512: ; X64: # %bb.0: -; X64-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x83,0xd9] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x83,0xd1] -; X64-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1] -; X64-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X64-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1) %2 = bitcast i64 %x3 to <64 x i1> %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %x2 - %4 = call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1) - %5 = bitcast i64 %x3 to <64 x i1> - %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer - %7 = call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1) - %res3 = add <64 x i8> %3, %6 - %res4 = add <64 x i8> %res3, %7 - ret <64 x i8> %res4 + ret <64 x i8> %3 +} + +define <64 x i8>@test_int_x86_avx512_maskz_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_512: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1) + %2 = bitcast i64 %x3 to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer + ret <64 x i8> %3 } declare <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>) +define <64 x i8>@test_int_x86_avx512_vpermi2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_qi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2b %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) + ret <64 x i8> %1 +} + define <64 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X86-NEXT: vpermt2b %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x7d,0xda] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xca] -; X86-NEXT: vpermt2b %zmm2, %zmm3, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0xc9,0x7d,0xc2] -; X86-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X86-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X64-NEXT: vpermt2b %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x7d,0xda] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xca] -; X64-NEXT: vpermt2b %zmm2, %zmm3, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0xc9,0x7d,0xc2] -; X64-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X64-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) %2 = bitcast i64 %x3 to <64 x i1> %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %x1 - %4 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %1, <64 x i8> %x2) - %5 = bitcast i64 %x3 to <64 x i1> - %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer - %7 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) - %res3 = add <64 x i8> %3, %6 - %res4 = add <64 x i8> %res3, %7 - ret <64 x i8> %res4 + ret <64 x i8> %3 +} + +define <64 x i8>@test_int_x86_avx512_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_qi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2b %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x1, <64 x i8> %x0, <64 x i8> %x2) + ret <64 x i8> %1 } define <64 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X86-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x7d,0xda] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7d,0xca] -; X86-NEXT: vpxor %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd9,0xef,0xe4] -; X86-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7d,0xe2] -; X86-NEXT: vpaddb %zmm3, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfc,0xc3] -; X86-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X64-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x7d,0xda] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7d,0xca] -; X64-NEXT: vpxor %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd9,0xef,0xe4] -; X64-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7d,0xe2] -; X64-NEXT: vpaddb %zmm3, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfc,0xc3] -; X64-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x1, <64 x i8> %x0, <64 x i8> %x2) %2 = bitcast i64 %x3 to <64 x i1> %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %x1 - %4 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> zeroinitializer, <64 x i8> %x0, <64 x i8> %x2) - %5 = bitcast i64 %x3 to <64 x i1> - %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer - %7 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x1, <64 x i8> %x0, <64 x i8> %x2) - %res3 = add <64 x i8> %3, %6 - %res4 = add <64 x i8> %res3, %7 - ret <64 x i8> %res4 + ret <64 x i8> %3 } define <64 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { diff --git a/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll index 4b1d51da46fbe6..57479666506abc 100644 --- a/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll @@ -4,258 +4,298 @@ declare <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_permvar_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_qi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermb %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_permvar_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_qi_128: ; X86: # %bb.0: -; X86-NEXT: vpermb %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x8d,0xd8] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermb %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x8d,0xd0] -; X86-NEXT: vpermb %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x8d,0xc0] -; X86-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_qi_128: ; X64: # %bb.0: -; X64-NEXT: vpermb %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermb %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x8d,0xd0] -; X64-NEXT: vpermb %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x8d,0xc0] -; X64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %x3) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) - %res3 = add <16 x i8> %res, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_permvar_qi_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_qi_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermb %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_qi_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermb %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %x3) + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_permvar_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_qi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermb %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_permvar_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_qi_256: ; X86: # %bb.0: -; X86-NEXT: vpermb %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x8d,0xd8] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermb %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x8d,0xd0] -; X86-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xc0] -; X86-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X86-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_qi_256: ; X64: # %bb.0: -; X64-NEXT: vpermb %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermb %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x8d,0xd0] -; X64-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xc0] -; X64-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X64-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> zeroinitializer, i32 %x3) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res3 = add <32 x i8> %res, %res1 - %res4 = add <32 x i8> %res3, %res2 - ret <32 x i8> %res4 + ret <32 x i8> %res +} + +define <32 x i8>@test_int_x86_avx512_maskz_permvar_qi_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_qi_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_qi_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> zeroinitializer, i32 %x3) + ret <32 x i8> %res } declare <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_pmultishift_qb_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmultishift_qb_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x83,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_pmultishift_qb_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmultishift_qb_128: ; X86: # %bb.0: -; X86-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x83,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x83,0xd1] -; X86-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1] -; X86-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmultishift_qb_128: ; X64: # %bb.0: -; X64-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x83,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x83,0xd1] -; X64-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1] -; X64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %x3) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) - %res3 = add <16 x i8> %res, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pmultishift_qb_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %x3) + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_pmultishift_qb_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmultishift_qb_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x83,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_pmultishift_qb_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmultishift_qb_256: ; X86: # %bb.0: -; X86-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x83,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x83,0xd1] -; X86-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1] -; X86-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X86-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmultishift_qb_256: ; X64: # %bb.0: -; X64-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x83,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x83,0xd1] -; X64-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1] -; X64-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X64-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> zeroinitializer, i32 %x3) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res3 = add <32 x i8> %res, %res1 - %res4 = add <32 x i8> %res3, %res2 - ret <32 x i8> %res4 + ret <32 x i8> %res +} + +define <32 x i8>@test_int_x86_avx512_maskz_pmultishift_qb_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> zeroinitializer, i32 %x3) + ret <32 x i8> %res } declare <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_qi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2b %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X86-NEXT: vpermt2b %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xca] -; X86-NEXT: vmovdqa %xmm1, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe1] -; X86-NEXT: vpermi2b %xmm2, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xe2] -; X86-NEXT: vpaddb %xmm3, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3] -; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X64-NEXT: vpermt2b %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xca] -; X64-NEXT: vmovdqa %xmm1, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe1] -; X64-NEXT: vpermi2b %xmm2, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xe2] -; X64-NEXT: vpaddb %xmm3, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3] -; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %res, <16 x i8> %x2, i16 %x3) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) - %res3 = add <16 x i8> %res, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_vpermi2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_qi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2b %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X86-NEXT: vpermt2b %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xca] -; X86-NEXT: vmovdqa %ymm1, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe1] -; X86-NEXT: vpermi2b %ymm2, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xe2] -; X86-NEXT: vpaddb %ymm3, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3] -; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X64-NEXT: vpermt2b %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xca] -; X64-NEXT: vmovdqa %ymm1, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe1] -; X64-NEXT: vpermi2b %ymm2, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xe2] -; X64-NEXT: vpaddb %ymm3, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3] -; X64-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %res, <32 x i8> %x2, i32 %x3) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res3 = add <32 x i8> %res, %res1 - %res4 = add <32 x i8> %res3, %res2 - ret <32 x i8> %res4 + ret <32 x i8> %res } declare <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_qi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2b %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2b %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7d,0xca] -; X86-NEXT: vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; X86-NEXT: vpermt2b %xmm2, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xe2] -; X86-NEXT: vpaddb %xmm3, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3] -; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2b %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7d,0xca] -; X64-NEXT: vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; X64-NEXT: vpermt2b %xmm2, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xe2] -; X64-NEXT: vpaddb %xmm3, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3] -; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> zeroinitializer, <16 x i8> %x2, i16 %x3) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) - %res3 = add <16 x i8> %res, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_qi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2b %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2b %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7d,0xca] -; X86-NEXT: vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; X86-NEXT: vpermt2b %ymm2, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7d,0xe2] -; X86-NEXT: vpaddb %ymm3, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3] -; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2b %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7d,0xca] -; X64-NEXT: vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; X64-NEXT: vpermt2b %ymm2, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7d,0xe2] -; X64-NEXT: vpaddb %ymm3, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3] -; X64-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> zeroinitializer, <32 x i8> %x2, i32 %x3) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res3 = add <32 x i8> %res, %res1 - %res4 = add <32 x i8> %res3, %res2 - ret <32 x i8> %res4 + ret <32 x i8> %res } declare <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) diff --git a/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics.ll index 7c03d78c825f32..748c35a3afcb9c 100644 --- a/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics.ll @@ -4,282 +4,318 @@ declare <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8>, <16 x i8>) +define <16 x i8>@test_int_x86_avx512_permvar_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_qi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermb %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1) + ret <16 x i8> %1 +} + define <16 x i8>@test_int_x86_avx512_mask_permvar_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_qi_128: ; X86: # %bb.0: -; X86-NEXT: vpermb %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x8d,0xd8] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermb %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x8d,0xd0] -; X86-NEXT: vpermb %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x8d,0xc0] -; X86-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_qi_128: ; X64: # %bb.0: -; X64-NEXT: vpermb %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermb %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x8d,0xd0] -; X64-NEXT: vpermb %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x8d,0xc0] -; X64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %x2 - %4 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1) - %5 = bitcast i16 %x3 to <16 x i1> - %6 = select <16 x i1> %5, <16 x i8> %4, <16 x i8> zeroinitializer - %7 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1) - %res3 = add <16 x i8> %3, %6 - %res4 = add <16 x i8> %res3, %7 - ret <16 x i8> %res4 + ret <16 x i8> %3 +} + +define <16 x i8>@test_int_x86_avx512_maskz_permvar_qi_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_qi_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermb %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_qi_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermb %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 } declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>) +define <32 x i8>@test_int_x86_avx512_permvar_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_qi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermb %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1) + ret <32 x i8> %1 +} + define <32 x i8>@test_int_x86_avx512_mask_permvar_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_qi_256: ; X86: # %bb.0: -; X86-NEXT: vpermb %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x8d,0xd8] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermb %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x8d,0xd0] -; X86-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xc0] -; X86-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X86-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_qi_256: ; X64: # %bb.0: -; X64-NEXT: vpermb %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermb %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x8d,0xd0] -; X64-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xc0] -; X64-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X64-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1) %2 = bitcast i32 %x3 to <32 x i1> %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %x2 - %4 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1) - %5 = bitcast i32 %x3 to <32 x i1> - %6 = select <32 x i1> %5, <32 x i8> %4, <32 x i8> zeroinitializer - %7 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1) - %res3 = add <32 x i8> %3, %6 - %res4 = add <32 x i8> %res3, %7 - ret <32 x i8> %res4 + ret <32 x i8> %3 +} + +define <32 x i8>@test_int_x86_avx512_maskz_permvar_qi_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_qi_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_qi_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1) + %2 = bitcast i32 %x3 to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 } declare <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8>, <16 x i8>) +define <16 x i8>@test_int_x86_avx512_pmultishift_qb_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmultishift_qb_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x83,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1) + ret <16 x i8> %1 +} + define <16 x i8>@test_int_x86_avx512_mask_pmultishift_qb_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmultishift_qb_128: ; X86: # %bb.0: -; X86-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x83,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x83,0xd1] -; X86-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1] -; X86-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmultishift_qb_128: ; X64: # %bb.0: -; X64-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x83,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x83,0xd1] -; X64-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1] -; X64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %x2 - %4 = call <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1) - %5 = bitcast i16 %x3 to <16 x i1> - %6 = select <16 x i1> %5, <16 x i8> %4, <16 x i8> zeroinitializer - %7 = call <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1) - %res3 = add <16 x i8> %3, %6 - %res4 = add <16 x i8> %res3, %7 - ret <16 x i8> %res4 + ret <16 x i8> %3 +} + +define <16 x i8>@test_int_x86_avx512_maskz_pmultishift_qb_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 } declare <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8>, <32 x i8>) +define <32 x i8>@test_int_x86_avx512_pmultishift_qb_256(<32 x i8> %x0, <32 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmultishift_qb_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x83,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1) + ret <32 x i8> %1 +} + define <32 x i8>@test_int_x86_avx512_mask_pmultishift_qb_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmultishift_qb_256: ; X86: # %bb.0: -; X86-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x83,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x83,0xd1] -; X86-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1] -; X86-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X86-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmultishift_qb_256: ; X64: # %bb.0: -; X64-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x83,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x83,0xd1] -; X64-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1] -; X64-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X64-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1) %2 = bitcast i32 %x3 to <32 x i1> %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %x2 - %4 = call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1) - %5 = bitcast i32 %x3 to <32 x i1> - %6 = select <32 x i1> %5, <32 x i8> %4, <32 x i8> zeroinitializer - %7 = call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1) - %res3 = add <32 x i8> %3, %6 - %res4 = add <32 x i8> %res3, %7 - ret <32 x i8> %res4 + ret <32 x i8> %3 +} + +define <32 x i8>@test_int_x86_avx512_maskz_pmultishift_qb_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1) + %2 = bitcast i32 %x3 to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 } declare <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>) +define <16 x i8>@test_int_x86_avx512_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_qi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2b %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) + ret <16 x i8> %1 +} + define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X86-NEXT: vpermt2b %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xca] -; X86-NEXT: vpermt2b %xmm2, %xmm3, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0x89,0x7d,0xc2] -; X86-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X64-NEXT: vpermt2b %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xca] -; X64-NEXT: vpermt2b %xmm2, %xmm3, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0x89,0x7d,0xc2] -; X64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %x1 - %4 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %1, <16 x i8> %x2) - %5 = bitcast i16 %x3 to <16 x i1> - %6 = select <16 x i1> %5, <16 x i8> %4, <16 x i8> zeroinitializer - %7 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) - %res3 = add <16 x i8> %3, %6 - %res4 = add <16 x i8> %res3, %7 - ret <16 x i8> %res4 + ret <16 x i8> %3 } declare <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>) +define <32 x i8>@test_int_x86_avx512_vpermi2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_qi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2b %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) + ret <32 x i8> %1 +} + define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X86-NEXT: vpermt2b %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xca] -; X86-NEXT: vpermt2b %ymm2, %ymm3, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0xa9,0x7d,0xc2] -; X86-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X64-NEXT: vpermt2b %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xca] -; X64-NEXT: vpermt2b %ymm2, %ymm3, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0xa9,0x7d,0xc2] -; X64-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X64-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) %2 = bitcast i32 %x3 to <32 x i1> %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %x1 - %4 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %1, <32 x i8> %x2) - %5 = bitcast i32 %x3 to <32 x i1> - %6 = select <32 x i1> %5, <32 x i8> %4, <32 x i8> zeroinitializer - %7 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) - %res3 = add <32 x i8> %3, %6 - %res4 = add <32 x i8> %res3, %7 - ret <32 x i8> %res4 + ret <32 x i8> %3 +} + +define <16 x i8>@test_int_x86_avx512_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_qi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2b %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x1, <16 x i8> %x0, <16 x i8> %x2) + ret <16 x i8> %1 } define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2b %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7d,0xca] -; X86-NEXT: vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; X86-NEXT: vpermt2b %xmm2, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xe2] -; X86-NEXT: vpaddb %xmm3, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3] -; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2b %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7d,0xca] -; X64-NEXT: vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; X64-NEXT: vpermt2b %xmm2, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xe2] -; X64-NEXT: vpaddb %xmm3, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3] -; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x1, <16 x i8> %x0, <16 x i8> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %x1 - %4 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> zeroinitializer, <16 x i8> %x0, <16 x i8> %x2) - %5 = bitcast i16 %x3 to <16 x i1> - %6 = select <16 x i1> %5, <16 x i8> %4, <16 x i8> zeroinitializer - %7 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x1, <16 x i8> %x0, <16 x i8> %x2) - %res3 = add <16 x i8> %3, %6 - %res4 = add <16 x i8> %res3, %7 - ret <16 x i8> %res4 + ret <16 x i8> %3 +} + +define <32 x i8>@test_int_x86_avx512_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_qi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2b %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x1, <32 x i8> %x0, <32 x i8> %x2) + ret <32 x i8> %1 } define <32 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2b %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7d,0xca] -; X86-NEXT: vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; X86-NEXT: vpermt2b %ymm2, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7d,0xe2] -; X86-NEXT: vpaddb %ymm3, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3] -; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2b %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7d,0xca] -; X64-NEXT: vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; X64-NEXT: vpermt2b %ymm2, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7d,0xe2] -; X64-NEXT: vpaddb %ymm3, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3] -; X64-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x1, <32 x i8> %x0, <32 x i8> %x2) %2 = bitcast i32 %x3 to <32 x i1> %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %x1 - %4 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> zeroinitializer, <32 x i8> %x0, <32 x i8> %x2) - %5 = bitcast i32 %x3 to <32 x i1> - %6 = select <32 x i1> %5, <32 x i8> %4, <32 x i8> zeroinitializer - %7 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x1, <32 x i8> %x0, <32 x i8> %x2) - %res3 = add <32 x i8> %3, %6 - %res4 = add <32 x i8> %res3, %7 - ret <32 x i8> %res4 + ret <32 x i8> %3 } define <16 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 9ea5000f19bd4d..3e9ad09b8d5873 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -4,375 +4,573 @@ declare <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pbroadcast_d_gpr_128(i32 %x0, <4 x i32> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_d_gpr_128: +; X86: # %bb.0: +; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_d_gpr_128: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastd %edi, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0x7c,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_128(i32 %x0, <4 x i32> %x1, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_128: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x4c,0x24,0x04] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc1] -; X86-NEXT: vmovdqa32 %xmm1, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x6f,0xd1] -; X86-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x58,0x44,0x24,0x01] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_128: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastd %edi, %xmm1 # encoding: [0x62,0xf2,0x7d,0x08,0x7c,0xcf] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpbroadcastd %edi, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7c,0xc7] -; X64-NEXT: vpbroadcastd %edi, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7c,0xd7] -; X64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 -1) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 %mask) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> zeroinitializer, i8 %mask) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res2, %res3 - ret <4 x i32> %res4 + %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 %mask) + ret <4 x i32> %res } +define <4 x i32>@test_int_x86_avx512_maskz_pbroadcast_d_gpr_128(i32 %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_d_gpr_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x58,0x44,0x24,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_d_gpr_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] +; X64-NEXT: vpbroadcastd %edi, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7c,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res +} declare <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_pbroadcast_q_gpr_128(i64 %x0, <2 x i64> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_q_gpr_128: +; X86: # %bb.0: +; X86-NEXT: vmovddup {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x44,0x24,0x04] +; X86-NEXT: # xmm0 = mem[0,0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_q_gpr_128: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastq %rdi, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_128(i64 %x0, <2 x i64> %x1, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_128: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x4c,0x24,0x04] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc1] -; X86-NEXT: vmovdqa64 %xmm1, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0x6f,0xd1] -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %xmm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x59,0x84,0x24,0x04,0x00,0x00,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_128: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastq %rdi, %xmm1 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xcf] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x7c,0xc7] -; X64-NEXT: vpbroadcastq %rdi, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7c,0xd7] -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 -1) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 %mask) - %res2 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> zeroinitializer,i8 %mask) - %res3 = add <2 x i64> %res, %res1 - %res4 = add <2 x i64> %res2, %res3 - ret <2 x i64> %res4 + %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 %mask) + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_maskz_pbroadcast_q_gpr_128(i64 %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_q_gpr_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x59,0x84,0x24,0x04,0x00,0x00,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_q_gpr_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] +; X64-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7c,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> zeroinitializer,i8 %mask) + ret <2 x i64> %res } +declare <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32, <8 x i32>, i8) - declare <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_d_gpr_256: +; X86: # %bb.0: +; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_d_gpr_256: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastd %edi, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x7c,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} - define <8 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1, i8 %mask) { +define <8 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_256: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x4c,0x24,0x04] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6f,0xc1] -; X86-NEXT: vmovdqa32 %ymm1, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x6f,0xd1] -; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x58,0x44,0x24,0x01] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_256: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastd %edi, %ymm1 # encoding: [0x62,0xf2,0x7d,0x28,0x7c,0xcf] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpbroadcastd %edi, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7c,0xc7] -; X64-NEXT: vpbroadcastd %edi, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7c,0xd7] -; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 -1) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 %mask) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> zeroinitializer, i8 %mask) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res2, %res3 - ret <8 x i32> %res4 - } + %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 %mask) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_pbroadcast_d_gpr_256(i32 %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_d_gpr_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x58,0x44,0x24,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_d_gpr_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] +; X64-NEXT: vpbroadcastd %edi, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7c,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> zeroinitializer, i8 %mask) + ret <8 x i32> %res +} + +declare <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64, <4 x i64>, i8) - declare <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_pbroadcast_q_gpr_256(i64 %x0, <4 x i64> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_q_gpr_256: +; X86: # %bb.0: +; X86-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_q_gpr_256: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastq %rdi, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x7c,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 -1) + ret <4 x i64> %res +} - define <4 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_256(i64 %x0, <4 x i64> %x1, i8 %mask) { +define <4 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_256(i64 %x0, <4 x i64> %x1, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_256: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x4c,0x24,0x04] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6f,0xc1] -; X86-NEXT: vmovdqa64 %ymm1, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x6f,0xd1] -; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %ymm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x59,0x84,0x24,0x04,0x00,0x00,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_256: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastq %rdi, %ymm1 # encoding: [0x62,0xf2,0xfd,0x28,0x7c,0xcf] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7c,0xc7] -; X64-NEXT: vpbroadcastq %rdi, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7c,0xd7] -; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 -1) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 %mask) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> zeroinitializer,i8 %mask) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res2, %res3 - ret <4 x i64> %res4 - } - + %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 %mask) + ret <4 x i64> %res +} +define <4 x i64>@test_int_x86_avx512_maskz_pbroadcast_q_gpr_256(i64 %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_q_gpr_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x59,0x84,0x24,0x04,0x00,0x00,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_q_gpr_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] +; X64-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7c,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> zeroinitializer,i8 %mask) + ret <4 x i64> %res +} declare <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32>, <8 x i32>, i8) -define <8 x i32>@test_int_x86_avx512_pbroadcastd_256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask, i32 * %y_ptr) { +define <8 x i32>@test_int_x86_avx512_pbroadcastd_256(<4 x i32> %x0, <8 x i32> %x1, i32 * %y_ptr) { ; X86-LABEL: test_int_x86_avx512_pbroadcastd_256: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x04] -; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] -; X86-NEXT: vpbroadcastd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x58,0xc8] -; X86-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x58,0xc0] -; X86-NEXT: vpaddd (%eax){1to8}, %ymm1, %ymm1 # encoding: [0x62,0xf1,0x75,0x38,0xfe,0x08] -; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vbroadcastss (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_pbroadcastd_256: ; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpbroadcastd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x58,0xc8] -; X64-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x58,0xc0] -; X64-NEXT: vpaddd (%rsi){1to8}, %ymm1, %ymm1 # encoding: [0x62,0xf1,0x75,0x38,0xfe,0x0e] -; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] +; X64-NEXT: vbroadcastss (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x07] ; X64-NEXT: retq # encoding: [0xc3] %y_32 = load i32, i32 * %y_ptr %y = insertelement <4 x i32> undef, i32 %y_32, i32 0 %res = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %y, <8 x i32> %x1, i8 -1) - %res1 = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask) - %res2 = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res2, %res3 - ret <8 x i32> %res4 + ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32>, <4 x i32>, i8) - -define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastd_128: +define <8 x i32>@test_int_x86_avx512_mask_pbroadcastd_256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask, i32 * %y_ptr) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastd_256: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8] -; X86-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X86-NEXT: vpbroadcastd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x58,0xc8] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastd_128: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastd_256: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8] -; X64-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X64-NEXT: vpbroadcastd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x58,0xc8] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1) - %res1 = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) - %res2 = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %mask) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res2, %res3 - ret <4 x i32> %res4 + %y_32 = load i32, i32 * %y_ptr + %y = insertelement <4 x i32> undef, i32 %y_32, i32 0 + %res = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask) + ret <8 x i32> %res } -declare <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64>, <4 x i64>, i8) - -define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x1, i8 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastq_256: +define <8 x i32>@test_int_x86_avx512_maskz_pbroadcastd_256(<4 x i32> %x0, i8 %mask, i32 * %y_ptr) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastd_256: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8] -; X86-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x58,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastq_256: +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastd_256: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8] -; X64-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x58,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1) - %res1 = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 %mask) - %res2 = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> zeroinitializer,i8 %mask) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res2, %res3 - ret <4 x i64> %res4 + %y_32 = load i32, i32 * %y_ptr + %y = insertelement <4 x i32> undef, i32 %y_32, i32 0 + %res = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask) + ret <8 x i32> %res } -declare <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64>, <2 x i64>, i8) +declare <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32>, <4 x i32>, i8) -define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x1, i8 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastq_128: +define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_mask_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastd_128: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8] -; X86-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] +; X86-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastq_128: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastd_128: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8] -; X64-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] +; X64-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1) - %res1 = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 %mask) - %res2 = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> zeroinitializer,i8 %mask) - %res3 = add <2 x i64> %res, %res1 - %res4 = add <2 x i64> %res2, %res3 - ret <2 x i64> %res4 + %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) + ret <4 x i32> %res } -declare <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double>, <4 x double>, i8) nounwind readonly - -define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double> %a1, i8 %mask ) { -; X86-LABEL: test_x86_vbroadcast_sd_pd_256: +define <4 x i32>@test_int_x86_avx512_maskz_pbroadcastd_128(<4 x i32> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastd_128: ; X86: # %bb.0: -; X86-NEXT: vbroadcastsd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8] -; X86-NEXT: vaddpd %ymm1, %ymm2, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9] -; X86-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] -; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] +; X86-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_x86_vbroadcast_sd_pd_256: +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastd_128: ; X64: # %bb.0: -; X64-NEXT: vbroadcastsd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8] -; X64-NEXT: vaddpd %ymm1, %ymm2, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9] -; X64-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] -; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] +; X64-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1) - %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> %a1, i8 %mask) - %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 %mask) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res2, %res3 - ret <4 x double> %res4 + %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res } -declare <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float>, <8 x float>, i8) nounwind readonly +declare <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1) + ret <4 x i64> %res +} -define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %a1, i8 %mask ) { -; X86-LABEL: test_x86_vbroadcast_ss_ps_256: +define <4 x i64>@test_int_x86_avx512_mask_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastq_256: ; X86: # %bb.0: -; X86-NEXT: vbroadcastss %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8] -; X86-NEXT: vaddps %ymm1, %ymm2, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9] -; X86-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] -; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] +; X86-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_x86_vbroadcast_ss_ps_256: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastq_256: ; X64: # %bb.0: -; X64-NEXT: vbroadcastss %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8] -; X64-NEXT: vaddps %ymm1, %ymm2, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9] -; X64-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] -; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] +; X64-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1) - %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> %a1, i8 %mask) - %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 %mask) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res2, %res3 - ret <8 x float> %res4 + %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 %mask) + ret <4 x i64> %res } -declare <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float>, <4 x float>, i8) nounwind readonly - -define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask ) { -; X86-LABEL: test_x86_vbroadcast_ss_ps_128: +define <4 x i64>@test_int_x86_avx512_maskz_pbroadcastq_256(<2 x i64> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastq_256: ; X86: # %bb.0: -; X86-NEXT: vbroadcastss %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8] -; X86-NEXT: vaddps %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9] -; X86-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] -; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] +; X86-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_x86_vbroadcast_ss_ps_128: +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastq_256: ; X64: # %bb.0: -; X64-NEXT: vbroadcastss %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8] -; X64-NEXT: vaddps %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9] -; X64-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] -; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] +; X64-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) - %res1 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) - %res2 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) - %res3 = fadd <4 x float> %res, %res1 - %res4 = fadd <4 x float> %res2, %res3 - ret <4 x float> %res4 + %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> zeroinitializer,i8 %mask) + ret <4 x i64> %res } -declare <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float>, <4 x float>, i8) +declare <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64>, <2 x i64>, i8) -define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) { -; X86-LABEL: test_int_x86_avx512_mask_movsldup_128: +define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] +; CHECK-NEXT: # xmm0 = xmm0[0,0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1) + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_mask_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastq_128: ; X86: # %bb.0: -; X86-NEXT: vmovsldup %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x12,0xd0] -; X86-NEXT: # xmm2 = xmm0[0,0,2,2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovsldup %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8] -; X86-NEXT: # xmm1 {%k1} = xmm0[0,0,2,2] -; X86-NEXT: vaddps %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] -; X86-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0,0,2,2] -; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] +; X86-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_movsldup_128: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastq_128: ; X64: # %bb.0: -; X64-NEXT: vmovsldup %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x12,0xd0] -; X64-NEXT: # xmm2 = xmm0[0,0,2,2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vmovsldup %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8] -; X64-NEXT: # xmm1 {%k1} = xmm0[0,0,2,2] -; X64-NEXT: vaddps %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] +; X64-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 %mask) + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_maskz_pbroadcastq_128(<2 x i64> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastq_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastq_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> zeroinitializer,i8 %mask) + ret <2 x i64> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double>, <4 x double>, i8) nounwind readonly + +define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double> %a1) { +; CHECK-LABEL: test_x86_vbroadcast_sd_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> undef, i8 -1) + ret <4 x double> %res +} + +define <4 x double> @test_x86_mask_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double> %a1, i8 %mask ) { +; X86-LABEL: test_x86_mask_vbroadcast_sd_pd_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8] +; X86-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_mask_vbroadcast_sd_pd_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8] +; X64-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> %a1, i8 %mask) + ret <4 x double> %res +} + +define <4 x double> @test_x86_maskz_vbroadcast_sd_pd_256(<2 x double> %a0, i8 %mask ) { +; X86-LABEL: test_x86_maskz_vbroadcast_sd_pd_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_maskz_vbroadcast_sd_pd_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 %mask) + ret <4 x double> %res +} + +declare <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float>, <8 x float>, i8) nounwind readonly + +define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %a1) { +; CHECK-LABEL: test_x86_vbroadcast_ss_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1) + ret <8 x float> %res +} + +define <8 x float> @test_x86_mask_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %a1, i8 %mask ) { +; X86-LABEL: test_x86_mask_vbroadcast_ss_ps_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_mask_vbroadcast_ss_ps_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> %a1, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_x86_maskz_vbroadcast_ss_ps_256(<4 x float> %a0, i8 %mask ) { +; X86-LABEL: test_x86_maskz_vbroadcast_ss_ps_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_maskz_vbroadcast_ss_ps_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 %mask) + ret <8 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float>, <4 x float>, i8) nounwind readonly + +define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %a1) { +; CHECK-LABEL: test_x86_vbroadcast_ss_ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> undef, i8 -1) + ret <4 x float> %res +} + + +define <4 x float> @test_x86_mask_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask ) { +; X86-LABEL: test_x86_mask_vbroadcast_ss_ps_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_mask_vbroadcast_ss_ps_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_x86_maskz_vbroadcast_ss_ps_128(<4 x float> %a0, i8 %mask ) { +; X86-LABEL: test_x86_maskz_vbroadcast_ss_ps_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_maskz_vbroadcast_ss_ps_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_mask_movsldup_128: +; X86: # %bb.0: +; X86-NEXT: vmovsldup %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x12,0xd0] +; X86-NEXT: # xmm2 = xmm0[0,0,2,2] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovsldup %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8] +; X86-NEXT: # xmm1 {%k1} = xmm0[0,0,2,2] +; X86-NEXT: vaddps %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] +; X86-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0,0,2,2] +; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_movsldup_128: +; X64: # %bb.0: +; X64-NEXT: vmovsldup %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x12,0xd0] +; X64-NEXT: # xmm2 = xmm0[0,0,2,2] +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmovsldup %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8] +; X64-NEXT: # xmm1 {%k1} = xmm0[0,0,2,2] +; X64-NEXT: vaddps %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; X64-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[0,0,2,2] ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] @@ -497,298 +695,419 @@ define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x f } declare <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_movddup_128(<2 x double> %x0, <2 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_movddup_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] +; CHECK-NEXT: # xmm0 = xmm0[0,0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_movddup_128: ; X86: # %bb.0: -; X86-NEXT: vmovddup %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xd0] -; X86-NEXT: # xmm2 = xmm0[0,0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vmovddup %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8] ; X86-NEXT: # xmm1 {%k1} = xmm0[0,0] -; X86-NEXT: vaddpd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] -; X86-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0,0] -; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] +; X86-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_movddup_128: ; X64: # %bb.0: -; X64-NEXT: vmovddup %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xd0] -; X64-NEXT: # xmm2 = xmm0[0,0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vmovddup %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8] ; X64-NEXT: # xmm1 {%k1} = xmm0[0,0] -; X64-NEXT: vaddpd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] +; X64-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2) + ret <2 x double> %res +} + +define <2 x double>@test_int_x86_avx512_maskz_movddup_128(<2 x double> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_movddup_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0,0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_movddup_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[0,0] -; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2) - %res1 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 -1) - %res2 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> zeroinitializer, i8 %x2) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res2, %res3 - ret <2 x double> %res4 + %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> zeroinitializer, i8 %x2) + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_movddup_256(<4 x double> %x0, <4 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_movddup_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovddup %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xff,0x12,0xc0] +; CHECK-NEXT: # ymm0 = ymm0[0,0,2,2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 -1) + ret <4 x double> %res +} + + define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_movddup_256: ; X86: # %bb.0: -; X86-NEXT: vmovddup %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xff,0x12,0xd0] -; X86-NEXT: # ymm2 = ymm0[0,0,2,2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vmovddup %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8] ; X86-NEXT: # ymm1 {%k1} = ymm0[0,0,2,2] -; X86-NEXT: vaddpd %ymm2, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca] -; X86-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0] -; X86-NEXT: # ymm0 {%k1} {z} = ymm0[0,0,2,2] -; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] +; X86-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_movddup_256: ; X64: # %bb.0: -; X64-NEXT: vmovddup %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xff,0x12,0xd0] -; X64-NEXT: # ymm2 = ymm0[0,0,2,2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vmovddup %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8] ; X64-NEXT: # ymm1 {%k1} = ymm0[0,0,2,2] -; X64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca] +; X64-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2) + ret <4 x double> %res +} + +define <4 x double>@test_int_x86_avx512_maskz_movddup_256(<4 x double> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_movddup_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0] +; X86-NEXT: # ymm0 {%k1} {z} = ymm0[0,0,2,2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_movddup_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0] ; X64-NEXT: # ymm0 {%k1} {z} = ymm0[0,0,2,2] -; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2) - %res1 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 -1) - %res2 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> zeroinitializer, i8 %x2) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res2, %res3 - ret <4 x double> %res4 + %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> zeroinitializer, i8 %x2) + ret <4 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double>, i32, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_vpermil_pd_256(<4 x double> %x0, <4 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermil_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilpd $6, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x06] +; CHECK-NEXT: # ymm0 = ymm0[0,1,3,2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_vpermil_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermil_pd_256: ; X86: # %bb.0: -; X86-NEXT: vpermilpd $6, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x05,0xd0,0x06] -; X86-NEXT: # ymm2 = ymm0[0,1,3,2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermilpd $6, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x05,0xc8,0x06] ; X86-NEXT: # ymm1 {%k1} = ymm0[0,1,3,2] -; X86-NEXT: vpermilpd $6, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x05,0xc0,0x06] -; X86-NEXT: # ymm0 {%k1} {z} = ymm0[0,1,3,2] -; X86-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; X86-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] +; X86-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermil_pd_256: ; X64: # %bb.0: -; X64-NEXT: vpermilpd $6, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x05,0xd0,0x06] -; X64-NEXT: # ymm2 = ymm0[0,1,3,2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilpd $6, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x05,0xc8,0x06] ; X64-NEXT: # ymm1 {%k1} = ymm0[0,1,3,2] +; X64-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 %x3) + ret <4 x double> %res +} + +define <4 x double>@test_int_x86_avx512_maskz_vpermil_pd_256(<4 x double> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermil_pd_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermilpd $6, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x05,0xc0,0x06] +; X86-NEXT: # ymm0 {%k1} {z} = ymm0[0,1,3,2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermil_pd_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilpd $6, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x05,0xc0,0x06] ; X64-NEXT: # ymm0 {%k1} {z} = ymm0[0,1,3,2] -; X64-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; X64-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 %x3) - %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> zeroinitializer, i8 %x3) - %res2 = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 -1) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res2, %res3 - ret <4 x double> %res4 + %res = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> zeroinitializer, i8 %x3) + ret <4 x double> %res } declare <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double>, i32, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_vpermil_pd_128(<2 x double> %x0, <2 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermil_pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01] +; CHECK-NEXT: # xmm0 = xmm0[1,0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_vpermil_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermil_pd_128: ; X86: # %bb.0: -; X86-NEXT: vpermilpd $1, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xd0,0x01] -; X86-NEXT: # xmm2 = xmm0[1,0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermilpd $1, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x05,0xc8,0x01] ; X86-NEXT: # xmm1 {%k1} = xmm0[1,0] -; X86-NEXT: vpermilpd $1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x05,0xc0,0x01] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[1,0] -; X86-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] -; X86-NEXT: vaddpd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2] +; X86-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermil_pd_128: ; X64: # %bb.0: -; X64-NEXT: vpermilpd $1, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xd0,0x01] -; X64-NEXT: # xmm2 = xmm0[1,0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilpd $1, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x05,0xc8,0x01] ; X64-NEXT: # xmm1 {%k1} = xmm0[1,0] +; X64-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 %x3) + ret <2 x double> %res +} + +define <2 x double>@test_int_x86_avx512_maskz_vpermil_pd_128(<2 x double> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermil_pd_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermilpd $1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x05,0xc0,0x01] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[1,0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermil_pd_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilpd $1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x05,0xc0,0x01] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[1,0] -; X64-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] -; X64-NEXT: vaddpd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 %x3) - %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> zeroinitializer, i8 %x3) - %res2 = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 -1) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res3, %res2 - ret <2 x double> %res4 + %res = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> zeroinitializer, i8 %x3) + ret <2 x double> %res } declare <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float>, i32, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_vpermil_ps_256(<8 x float> %x0, <8 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermil_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilps $22, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x16] +; CHECK-NEXT: # ymm0 = ymm0[2,1,1,0,6,5,5,4] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_vpermil_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermil_ps_256: ; X86: # %bb.0: -; X86-NEXT: vpermilps $22, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xd0,0x16] -; X86-NEXT: # ymm2 = ymm0[2,1,1,0,6,5,5,4] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermilps $22, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x04,0xc8,0x16] ; X86-NEXT: # ymm1 {%k1} = ymm0[2,1,1,0,6,5,5,4] -; X86-NEXT: vpermilps $22, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x04,0xc0,0x16] -; X86-NEXT: # ymm0 {%k1} {z} = ymm0[2,1,1,0,6,5,5,4] -; X86-NEXT: vaddps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] -; X86-NEXT: vaddps %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermil_ps_256: ; X64: # %bb.0: -; X64-NEXT: vpermilps $22, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xd0,0x16] -; X64-NEXT: # ymm2 = ymm0[2,1,1,0,6,5,5,4] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps $22, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x04,0xc8,0x16] ; X64-NEXT: # ymm1 {%k1} = ymm0[2,1,1,0,6,5,5,4] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 %x3) + ret <8 x float> %res +} + +define <8 x float>@test_int_x86_avx512_maskz_vpermil_ps_256(<8 x float> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermil_ps_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermilps $22, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x04,0xc0,0x16] +; X86-NEXT: # ymm0 {%k1} {z} = ymm0[2,1,1,0,6,5,5,4] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermil_ps_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps $22, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x04,0xc0,0x16] ; X64-NEXT: # ymm0 {%k1} {z} = ymm0[2,1,1,0,6,5,5,4] -; X64-NEXT: vaddps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] -; X64-NEXT: vaddps %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 %x3) - %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> zeroinitializer, i8 %x3) - %res2 = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 -1) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res3, %res2 - ret <8 x float> %res4 + %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> zeroinitializer, i8 %x3) + ret <8 x float> %res } declare <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float>, i32, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_vpermil_ps_128(<4 x float> %x0, <4 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermil_ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilps $22, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x16] +; CHECK-NEXT: # xmm0 = xmm0[2,1,1,0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_vpermil_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermil_ps_128: ; X86: # %bb.0: -; X86-NEXT: vpermilps $22, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xd0,0x16] -; X86-NEXT: # xmm2 = xmm0[2,1,1,0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermilps $22, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x04,0xc8,0x16] ; X86-NEXT: # xmm1 {%k1} = xmm0[2,1,1,0] -; X86-NEXT: vpermilps $22, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x04,0xc0,0x16] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[2,1,1,0] -; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; X86-NEXT: vaddps %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermil_ps_128: ; X64: # %bb.0: -; X64-NEXT: vpermilps $22, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xd0,0x16] -; X64-NEXT: # xmm2 = xmm0[2,1,1,0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps $22, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x04,0xc8,0x16] ; X64-NEXT: # xmm1 {%k1} = xmm0[2,1,1,0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 %x3) + ret <4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_maskz_vpermil_ps_128(<4 x float> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermil_ps_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermilps $22, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x04,0xc0,0x16] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[2,1,1,0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermil_ps_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps $22, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x04,0xc0,0x16] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[2,1,1,0] -; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 %x3) - %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> zeroinitializer, i8 %x3) - %res2 = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 -1) - %res3 = fadd <4 x float> %res, %res1 - %res4 = fadd <4 x float> %res2, %res3 - ret <4 x float> %res4 + %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> zeroinitializer, i8 %x3) + ret <4 x float> %res } declare <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double>, i32, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_perm_df_256(<4 x double> %x0, i32 %x1, <4 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_perm_df_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermpd $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x01,0xc0,0x03] +; CHECK-NEXT: # ymm0 = ymm0[3,0,0,0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_perm_df_256(<4 x double> %x0, i32 %x1, <4 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_perm_df_256: ; X86: # %bb.0: -; X86-NEXT: vpermpd $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x01,0xd0,0x03] -; X86-NEXT: # ymm2 = ymm0[3,0,0,0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermpd $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x01,0xc8,0x03] ; X86-NEXT: # ymm1 {%k1} = ymm0[3,0,0,0] -; X86-NEXT: vpermpd $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x01,0xc0,0x03] -; X86-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0] -; X86-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; X86-NEXT: vaddpd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2] +; X86-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_perm_df_256: ; X64: # %bb.0: -; X64-NEXT: vpermpd $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x01,0xd0,0x03] -; X64-NEXT: # ymm2 = ymm0[3,0,0,0] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpermpd $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x01,0xc8,0x03] ; X64-NEXT: # ymm1 {%k1} = ymm0[3,0,0,0] +; X64-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 %x3) + ret <4 x double> %res +} + +define <4 x double>@test_int_x86_avx512_maskz_perm_df_256(<4 x double> %x0, i32 %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_perm_df_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermpd $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x01,0xc0,0x03] +; X86-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_perm_df_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpermpd $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x01,0xc0,0x03] ; X64-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0] -; X64-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; X64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 %x3) - %res1 = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> zeroinitializer, i8 %x3) - %res2 = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 -1) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res3, %res2 - ret <4 x double> %res4 + %res = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> zeroinitializer, i8 %x3) + ret <4 x double> %res } declare <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64>, i32, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_perm_di_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_perm_di_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermpd $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x01,0xc0,0x03] +; CHECK-NEXT: # ymm0 = ymm0[3,0,0,0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_perm_di_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_perm_di_256: ; X86: # %bb.0: -; X86-NEXT: vpermq $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x00,0xd0,0x03] -; X86-NEXT: # ymm2 = ymm0[3,0,0,0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermq $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x00,0xc8,0x03] ; X86-NEXT: # ymm1 {%k1} = ymm0[3,0,0,0] -; X86-NEXT: vpermq $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x00,0xc0,0x03] -; X86-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0] -; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_perm_di_256: ; X64: # %bb.0: -; X64-NEXT: vpermq $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x00,0xd0,0x03] -; X64-NEXT: # ymm2 = ymm0[3,0,0,0] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpermq $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x00,0xc8,0x03] ; X64-NEXT: # ymm1 {%k1} = ymm0[3,0,0,0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_perm_di_256(<4 x i64> %x0, i32 %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_perm_di_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermq $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x00,0xc0,0x03] +; X86-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_perm_di_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpermq $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x00,0xc0,0x03] ; X64-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0] -; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 + %res = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3) + ret <4 x i64> %res } declare void @llvm.x86.avx512.mask.store.pd.128(i8*, <2 x double>, i8) @@ -1661,76 +1980,106 @@ define <4 x i64> @test_mask_load_aligned_q_256(<4 x i64> %data, i8* %ptr, i8 %ma declare <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32>, i32, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pshuf_d_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshuf_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilps $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x03] +; CHECK-NEXT: # xmm0 = xmm0[3,0,0,0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_pshuf_d_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshuf_d_128: ; X86: # %bb.0: -; X86-NEXT: vpshufd $3, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xd0,0x03] -; X86-NEXT: # xmm2 = xmm0[3,0,0,0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpshufd $3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x70,0xc8,0x03] ; X86-NEXT: # xmm1 {%k1} = xmm0[3,0,0,0] -; X86-NEXT: vpshufd $3, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x70,0xc0,0x03] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[3,0,0,0] -; X86-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshuf_d_128: ; X64: # %bb.0: -; X64-NEXT: vpshufd $3, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xd0,0x03] -; X64-NEXT: # xmm2 = xmm0[3,0,0,0] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpshufd $3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x70,0xc8,0x03] ; X64-NEXT: # xmm1 {%k1} = xmm0[3,0,0,0] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pshuf_d_128(<4 x i32> %x0, i32 %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pshuf_d_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpshufd $3, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x70,0xc0,0x03] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[3,0,0,0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pshuf_d_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpshufd $3, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x70,0xc0,0x03] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[3,0,0,0] -; X64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 x i32> %res4 + %res = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3) + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32>, i32, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pshuf_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshuf_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilps $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x03] +; CHECK-NEXT: # ymm0 = ymm0[3,0,0,0,7,4,4,4] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pshuf_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshuf_d_256: ; X86: # %bb.0: -; X86-NEXT: vpshufd $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x70,0xd0,0x03] -; X86-NEXT: # ymm2 = ymm0[3,0,0,0,7,4,4,4] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpshufd $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x70,0xc8,0x03] ; X86-NEXT: # ymm1 {%k1} = ymm0[3,0,0,0,7,4,4,4] -; X86-NEXT: vpshufd $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x70,0xc0,0x03] -; X86-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0,7,4,4,4] -; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshuf_d_256: ; X64: # %bb.0: -; X64-NEXT: vpshufd $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x70,0xd0,0x03] -; X64-NEXT: # ymm2 = ymm0[3,0,0,0,7,4,4,4] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpshufd $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x70,0xc8,0x03] ; X64-NEXT: # ymm1 {%k1} = ymm0[3,0,0,0,7,4,4,4] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_pshuf_d_256(<8 x i32> %x0, i32 %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pshuf_d_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpshufd $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x70,0xc0,0x03] +; X86-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0,7,4,4,4] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pshuf_d_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpshufd $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x70,0xc0,0x03] ; X64-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0,7,4,4,4] -; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 + %res = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3) + ret <8 x i32> %res } define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) { @@ -2017,466 +2366,530 @@ declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8) declare <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_unpckh_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckh_pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[1],xmm1[1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_unpckh_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckh_pd_128: ; X86: # %bb.0: -; X86-NEXT: vunpckhpd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xd9] -; X86-NEXT: # xmm3 = xmm0[1],xmm1[1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpckhpd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x15,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[1],xmm1[1] -; X86-NEXT: vaddpd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3] +; X86-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckh_pd_128: ; X64: # %bb.0: -; X64-NEXT: vunpckhpd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xd9] -; X64-NEXT: # xmm3 = xmm0[1],xmm1[1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpckhpd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x15,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[1],xmm1[1] -; X64-NEXT: vaddpd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3] +; X64-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) - %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_unpckh_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_unpckh_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpckhpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x15,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_unpckh_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckh_pd_256: ; X86: # %bb.0: -; X86-NEXT: vunpckhpd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x15,0xd9] -; X86-NEXT: # ymm3 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpckhpd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x15,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; X86-NEXT: vaddpd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] +; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckh_pd_256: ; X64: # %bb.0: -; X64-NEXT: vunpckhpd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x15,0xd9] -; X64-NEXT: # ymm3 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpckhpd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x15,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; X64-NEXT: vaddpd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] +; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) - %res1 = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + ret <4 x double> %res } declare <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_unpckh_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckh_ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_unpckh_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckh_ps_128: ; X86: # %bb.0: -; X86-NEXT: vunpckhps %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xd9] -; X86-NEXT: # xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpckhps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x15,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-NEXT: vaddps %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckh_ps_128: ; X64: # %bb.0: -; X64-NEXT: vunpckhps %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xd9] -; X64-NEXT: # xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpckhps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x15,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: vaddps %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) - %res1 = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res } declare <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_unpckh_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckh_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x15,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_unpckh_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckh_ps_256: ; X86: # %bb.0: -; X86-NEXT: vunpckhps %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x15,0xd9] -; X86-NEXT: # ymm3 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpckhps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x15,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; X86-NEXT: vaddps %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckh_ps_256: ; X64: # %bb.0: -; X64-NEXT: vunpckhps %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x15,0xd9] -; X64-NEXT: # ymm3 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpckhps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x15,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; X64-NEXT: vaddps %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) - %res1 = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) - %res2 = fadd <8 x float> %res, %res1 - ret <8 x float> %res2 + ret <8 x float> %res } declare <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_unpckl_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckl_pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_unpckl_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckl_pd_128: ; X86: # %bb.0: -; X86-NEXT: vunpcklpd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x14,0xd9] -; X86-NEXT: # xmm3 = xmm0[0],xmm1[0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpcklpd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x14,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0] -; X86-NEXT: vaddpd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3] +; X86-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckl_pd_128: ; X64: # %bb.0: -; X64-NEXT: vunpcklpd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x14,0xd9] -; X64-NEXT: # xmm3 = xmm0[0],xmm1[0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpcklpd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x14,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0] -; X64-NEXT: vaddpd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3] +; X64-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) - %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_unpckl_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckl_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpcklpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x14,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_unpckl_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckl_pd_256: ; X86: # %bb.0: -; X86-NEXT: vunpcklpd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x14,0xd9] -; X86-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpcklpd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x14,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; X86-NEXT: vaddpd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] +; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckl_pd_256: ; X64: # %bb.0: -; X64-NEXT: vunpcklpd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x14,0xd9] -; X64-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpcklpd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x14,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; X64-NEXT: vaddpd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] +; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) - %res1 = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + ret <4 x double> %res } declare <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_unpckl_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckl_ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpcklps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_unpckl_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckl_ps_128: ; X86: # %bb.0: -; X86-NEXT: vunpcklps %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xd9] -; X86-NEXT: # xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpcklps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x14,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: vaddps %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckl_ps_128: ; X64: # %bb.0: -; X64-NEXT: vunpcklps %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xd9] -; X64-NEXT: # xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpcklps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x14,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: vaddps %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) - %res1 = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res } declare <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_unpckl_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckl_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpcklps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x14,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_unpckl_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckl_ps_256: ; X86: # %bb.0: -; X86-NEXT: vunpcklps %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x14,0xd9] -; X86-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpcklps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x14,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; X86-NEXT: vaddps %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckl_ps_256: ; X64: # %bb.0: -; X64-NEXT: vunpcklps %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x14,0xd9] -; X64-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpcklps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x14,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; X64-NEXT: vaddps %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) - %res1 = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) - %res2 = fadd <8 x float> %res, %res1 - ret <8 x float> %res2 + ret <8 x float> %res } declare <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_ask_punpckhd_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_ask_punpckhd_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_punpckhd_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhd_q_128: ; X86: # %bb.0: -; X86-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6a,0xd9] -; X86-NEXT: # xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6a,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-NEXT: vpaddd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhd_q_128: ; X64: # %bb.0: -; X64-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6a,0xd9] -; X64-NEXT: # xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6a,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: vpaddd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_punpckld_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckld_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpcklps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_punpckld_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckld_q_128: ; X86: # %bb.0: -; X86-NEXT: vpunpckldq %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x62,0xd9] -; X86-NEXT: # xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpckldq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x62,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: vpaddd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckld_q_128: ; X64: # %bb.0: -; X64-NEXT: vpunpckldq %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x62,0xd9] -; X64-NEXT: # xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpckldq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x62,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: vpaddd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_punpckhd_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhd_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x15,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_punpckhd_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhd_q_256: ; X86: # %bb.0: -; X86-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6a,0xd9] -; X86-NEXT: # ymm3 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6a,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhd_q_256: ; X64: # %bb.0: -; X64-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6a,0xd9] -; X64-NEXT: # ymm3 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6a,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_punpckld_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckld_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpcklps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x14,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_punpckld_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckld_q_256: ; X86: # %bb.0: -; X86-NEXT: vpunpckldq %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x62,0xd9] -; X86-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpckldq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x62,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckld_q_256: ; X64: # %bb.0: -; X64-NEXT: vpunpckldq %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x62,0xd9] -; X64-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpckldq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x62,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_punpckhqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhqd_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[1],xmm1[1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_punpckhqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhqd_q_128: ; X86: # %bb.0: -; X86-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6d,0xd9] -; X86-NEXT: # xmm3 = xmm0[1],xmm1[1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6d,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[1],xmm1[1] -; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhqd_q_128: ; X64: # %bb.0: -; X64-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6d,0xd9] -; X64-NEXT: # xmm3 = xmm0[1],xmm1[1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6d,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[1],xmm1[1] -; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_punpcklqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpcklqd_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_punpcklqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpcklqd_q_128: ; X86: # %bb.0: -; X86-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xd9] -; X86-NEXT: # xmm3 = xmm0[0],xmm1[0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6c,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0] -; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpcklqd_q_128: ; X64: # %bb.0: -; X64-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xd9] -; X64-NEXT: # xmm3 = xmm0[0],xmm1[0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6c,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0] -; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_punpcklqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpcklqd_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpcklpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x14,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_punpcklqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpcklqd_q_256: ; X86: # %bb.0: -; X86-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6c,0xd9] -; X86-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6c,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; X86-NEXT: vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpcklqd_q_256: ; X64: # %bb.0: -; X64-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6c,0xd9] -; X64-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6c,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; X64-NEXT: vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_punpckhqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhqd_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpckhpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x15,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_punpckhqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhqd_q_256: ; X86: # %bb.0: -; X86-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6d,0xd9] -; X86-NEXT: # ymm3 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6d,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; X86-NEXT: vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhqd_q_256: ; X64: # %bb.0: -; X64-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6d,0xd9] -; X64-NEXT: # ymm3 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6d,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; X64-NEXT: vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } define <4 x i32> @test_mask_and_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { @@ -5159,6 +5572,16 @@ declare <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float>, <4 x float>, < declare <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float>, <8 x float>, i32, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_shuf_f32x4_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_f32x4_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vblendps $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0] +; CHECK-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_f32x4_256: ; X86: # %bb.0: @@ -5166,10 +5589,7 @@ define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x ; X86-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovaps %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x28,0xd0] -; X86-NEXT: vmovaps %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc8] -; X86-NEXT: vaddps %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; X86-NEXT: vaddps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] +; X86-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_f32x4_256: @@ -5177,21 +5597,45 @@ define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x ; X64-NEXT: vblendps $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0] ; X64-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vmovaps %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x28,0xd0] -; X64-NEXT: vmovaps %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc8] -; X64-NEXT: vaddps %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; X64-NEXT: vaddps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] +; X64-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x65,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4) + ret <8 x float> %res +} + +define <8 x float>@test_int_x86_avx512_maskz_shuf_f32x4_256(<8 x float> %x0, <8 x float> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_shuf_f32x4_256: +; X86: # %bb.0: +; X86-NEXT: vblendps $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0] +; X86-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_shuf_f32x4_256: +; X64: # %bb.0: +; X64-NEXT: vblendps $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0] +; X64-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4) - %res1 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1) - %res2 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> zeroinitializer, i8 %x4) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res2, %res3 - ret <8 x float> %res4 + %res = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> zeroinitializer, i8 %x4) + ret <8 x float> %res } declare <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double>, <4 x double>, i32, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_shuf_f64x2_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_shuf_f64x2_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vblendps $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0] +; CHECK-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_f64x2_256: ; X86: # %bb.0: @@ -5199,10 +5643,7 @@ define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4 ; X86-NEXT: # ymm0 = ymm0[0,1],ymm1[2,3] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovapd %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x28,0xd0] -; X86-NEXT: vmovapd %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x28,0xc8] -; X86-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; X86-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] +; X86-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_f64x2_256: @@ -5210,21 +5651,45 @@ define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4 ; X64-NEXT: vblendpd $12, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0d,0xc1,0x0c] ; X64-NEXT: # ymm0 = ymm0[0,1],ymm1[2,3] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vmovapd %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x28,0xd0] -; X64-NEXT: vmovapd %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x28,0xc8] -; X64-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; X64-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] +; X64-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x65,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 %x4) + ret <4 x double> %res +} + +define <4 x double>@test_int_x86_avx512_maskz_shuf_f64x2_256(<4 x double> %x0, <4 x double> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_shuf_f64x2_256: +; X86: # %bb.0: +; X86-NEXT: vblendpd $12, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0d,0xc1,0x0c] +; X86-NEXT: # ymm0 = ymm0[0,1],ymm1[2,3] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x28,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_shuf_f64x2_256: +; X64: # %bb.0: +; X64-NEXT: vblendpd $12, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0d,0xc1,0x0c] +; X64-NEXT: # ymm0 = ymm0[0,1],ymm1[2,3] +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 %x4) - %res1 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 -1) - %res2 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> zeroinitializer, i8 %x4) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res2, %res3 - ret <4 x double> %res4 + %res = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> zeroinitializer, i8 %x4) + ret <4 x double> %res } declare <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_shuf_i32x4_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_shuf_i32x4_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vblendps $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0] +; CHECK-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_shuf_i32x4_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_i32x4_256: ; X86: # %bb.0: @@ -5232,8 +5697,7 @@ define <8 x i32>@test_int_x86_avx512_mask_shuf_i32x4_256(<8 x i32> %x0, <8 x i32 ; X86-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovdqa32 %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6f,0xd0] -; X86-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] +; X86-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x64,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_i32x4_256: @@ -5241,17 +5705,24 @@ define <8 x i32>@test_int_x86_avx512_mask_shuf_i32x4_256(<8 x i32> %x0, <8 x i32 ; X64-NEXT: vpblendd $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x02,0xc1,0xf0] ; X64-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vmovdqa32 %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6f,0xd0] -; X64-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] +; X64-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x64,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + %res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4) + ret <8 x i32> %res } declare <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_shuf_i64x2_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_i64x2_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vblendps $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0] +; CHECK-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_i64x2_256: ; X86: # %bb.0: @@ -5259,8 +5730,7 @@ define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64 ; X86-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovdqa64 %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6f,0xd0] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x64,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_i64x2_256: @@ -5268,137 +5738,161 @@ define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64 ; X64-NEXT: vpblendd $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x02,0xc1,0xf0] ; X64-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vmovdqa64 %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6f,0xd0] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x64,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + %res = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4) + ret <4 x i64> %res } declare <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double>, <2 x double>, i32, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_shuf_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_shuf_pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc6,0xc1,0x01] +; CHECK-NEXT: # xmm0 = xmm0[1],xmm1[0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_pd_128: ; X86: # %bb.0: -; X86-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc6,0xd9,0x01] -; X86-NEXT: # xmm3 = xmm0[1],xmm1[0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x01] ; X86-NEXT: # xmm2 {%k1} = xmm0[1],xmm1[0] -; X86-NEXT: vaddpd %xmm3, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xd3] -; X86-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[1],xmm1[0] -; X86-NEXT: vaddpd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2] +; X86-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_pd_128: ; X64: # %bb.0: -; X64-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc6,0xd9,0x01] -; X64-NEXT: # xmm3 = xmm0[1],xmm1[0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x01] ; X64-NEXT: # xmm2 {%k1} = xmm0[1],xmm1[0] -; X64-NEXT: vaddpd %xmm3, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xd3] +; X64-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 %x4) + ret <2 x double> %res +} + +define <2 x double>@test_int_x86_avx512_maskz_shuf_pd_128(<2 x double> %x0, <2 x double> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_shuf_pd_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[1],xmm1[0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_shuf_pd_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[1],xmm1[0] -; X64-NEXT: vaddpd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 %x4) - %res1 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 -1) - %res2 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> zeroinitializer, i8 %x4) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res2, %res3 - ret <2 x double> %res4 + %res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> zeroinitializer, i8 %x4) + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double>, <4 x double>, i32, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_shuf_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufpd $6, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xc6,0xc1,0x06] +; CHECK-NEXT: # ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double> %x0, <4 x double> %x1, i32 6, <4 x double> %x3, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_shuf_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_pd_256: ; X86: # %bb.0: -; X86-NEXT: vshufpd $6, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xc6,0xd9,0x06] -; X86-NEXT: # ymm3 = ymm0[0],ymm1[1],ymm0[3],ymm1[2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vshufpd $6, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xc6,0xd1,0x06] ; X86-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[1],ymm0[3],ymm1[2] -; X86-NEXT: vaddpd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] +; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_pd_256: ; X64: # %bb.0: -; X64-NEXT: vshufpd $6, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xc6,0xd9,0x06] -; X64-NEXT: # ymm3 = ymm0[0],ymm1[1],ymm0[3],ymm1[2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshufpd $6, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xc6,0xd1,0x06] ; X64-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[1],ymm0[3],ymm1[2] -; X64-NEXT: vaddpd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] +; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double> %x0, <4 x double> %x1, i32 6, <4 x double> %x3, i8 %x4) - %res1 = call <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double> %x0, <4 x double> %x1, i32 6, <4 x double> %x3, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + ret <4 x double> %res } declare <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float>, <4 x float>, i32, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_shuf_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc1,0x16] +; CHECK-NEXT: # xmm0 = xmm0[2,1],xmm1[1,0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float> %x0, <4 x float> %x1, i32 22, <4 x float> %x3, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_shuf_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_ps_128: ; X86: # %bb.0: -; X86-NEXT: vshufps $22, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xd9,0x16] -; X86-NEXT: # xmm3 = xmm0[2,1],xmm1[1,0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vshufps $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0xc6,0xd1,0x16] ; X86-NEXT: # xmm2 {%k1} = xmm0[2,1],xmm1[1,0] -; X86-NEXT: vaddps %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_ps_128: ; X64: # %bb.0: -; X64-NEXT: vshufps $22, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xd9,0x16] -; X64-NEXT: # xmm3 = xmm0[2,1],xmm1[1,0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshufps $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0xc6,0xd1,0x16] ; X64-NEXT: # xmm2 {%k1} = xmm0[2,1],xmm1[1,0] -; X64-NEXT: vaddps %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float> %x0, <4 x float> %x1, i32 22, <4 x float> %x3, i8 %x4) - %res1 = call <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float> %x0, <4 x float> %x1, i32 22, <4 x float> %x3, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res } declare <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float>, <8 x float>, i32, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_shuf_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0xc6,0xc1,0x16] +; CHECK-NEXT: # ymm0 = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_shuf_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_ps_256: ; X86: # %bb.0: -; X86-NEXT: vshufps $22, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0xc6,0xd9,0x16] -; X86-NEXT: # ymm3 = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vshufps $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0xc6,0xd1,0x16] ; X86-NEXT: # ymm2 {%k1} = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4] -; X86-NEXT: vaddps %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_ps_256: ; X64: # %bb.0: -; X64-NEXT: vshufps $22, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0xc6,0xd9,0x16] -; X64-NEXT: # ymm3 = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshufps $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0xc6,0xd1,0x16] ; X64-NEXT: # ymm2 {%k1} = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4] -; X64-NEXT: vaddps %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4) - %res1 = call <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1) - %res2 = fadd <8 x float> %res, %res1 - ret <8 x float> %res2 + ret <8 x float> %res } declare <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) @@ -5409,71 +5903,94 @@ define <4 x i32>@test_int_x86_avx512_mask_pmaxs_d_128(<4 x i32> %x0, <4 x i32> % ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3d,0xd1] -; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3d,0xc1] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_d_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3d,0xd1] -; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3d,0xc1] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2 ,i8 %mask) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pmaxs_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmaxs_d_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3d,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmaxs_d_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3d,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmaxs_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxs_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmaxs_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxs_d_256: ; X86: # %bb.0: -; X86-NEXT: vpmaxsd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmaxsd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3d,0xd1] -; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_d_256: ; X64: # %bb.0: -; X64-NEXT: vpmaxsd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxsd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3d,0xd1] -; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_pmaxs_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxs_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x3d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_pmaxs_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxs_q_128: ; X86: # %bb.0: -; X86-NEXT: vpmaxsq %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x3d,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmaxsq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x3d,0xd1] -; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_q_128: ; X64: # %bb.0: -; X64-NEXT: vpmaxsq %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x3d,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxsq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x3d,0xd1] -; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) @@ -5484,21 +6001,34 @@ define <4 x i64>@test_int_x86_avx512_mask_pmaxs_q_256(<4 x i64> %x0, <4 x i64> % ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmaxsq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x3d,0xd1] -; X86-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3d,0xc1] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_q_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxsq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x3d,0xd1] -; X64-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3d,0xc1] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_pmaxs_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmaxs_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3d,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmaxs_q_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3d,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) + ret <4 x i64> %res } declare <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) @@ -5509,71 +6039,94 @@ define <4 x i32>@test_int_x86_avx512_mask_pmaxu_d_128(<4 x i32> %x0, <4 x i32> % ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3f,0xd1] -; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3f,0xc1] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_d_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3f,0xd1] -; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3f,0xc1] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) - -define <8 x i32>@test_int_x86_avx512_mask_pmaxu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { -; X86-LABEL: test_int_x86_avx512_mask_pmaxu_d_256: +define <4 x i32>@test_int_x86_avx512_maskz_pmaxu_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmaxu_d_128: ; X86: # %bb.0: -; X86-NEXT: vpmaxud %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmaxud %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3f,0xd1] -; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_pmaxu_d_256: +; X64-LABEL: test_int_x86_avx512_maskz_pmaxu_d_128: ; X64: # %bb.0: -; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3f,0xd1] -; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + %res = call <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res } -declare <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +declare <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) -define <2 x i64>@test_int_x86_avx512_mask_pmaxu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { -; X86-LABEL: test_int_x86_avx512_mask_pmaxu_q_128: -; X86: # %bb.0: -; X86-NEXT: vpmaxuq %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x3f,0xd9] +define <8 x i32>@test_int_x86_avx512_pmaxu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxu_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_mask_pmaxu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_mask_pmaxu_d_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmaxud %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3f,0xd1] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_pmaxu_d_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3f,0xd1] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) + ret <8 x i32> %res +} + +declare <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_pmaxu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxu_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x3f,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_mask_pmaxu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_mask_pmaxu_q_128: +; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmaxuq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x3f,0xd1] -; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_q_128: ; X64: # %bb.0: -; X64-NEXT: vpmaxuq %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x3f,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxuq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x3f,0xd1] -; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) @@ -5584,21 +6137,34 @@ define <4 x i64>@test_int_x86_avx512_mask_pmaxu_q_256(<4 x i64> %x0, <4 x i64> % ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmaxuq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x3f,0xd1] -; X86-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3f,0xc1] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_q_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxuq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x3f,0xd1] -; X64-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3f,0xc1] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_pmaxu_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmaxu_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmaxu_q_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) + ret <4 x i64> %res } declare <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) @@ -5609,71 +6175,94 @@ define <4 x i32>@test_int_x86_avx512_mask_pmins_d_128(<4 x i32> %x0, <4 x i32> % ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x39,0xd1] -; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x39,0xc1] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_d_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x39,0xd1] -; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x39,0xc1] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pmins_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmins_d_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x39,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmins_d_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x39,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmins_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmins_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmins_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmins_d_256: ; X86: # %bb.0: -; X86-NEXT: vpminsd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminsd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x39,0xd1] -; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_d_256: ; X64: # %bb.0: -; X64-NEXT: vpminsd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminsd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x39,0xd1] -; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_pmins_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmins_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminsq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x39,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_pmins_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmins_q_128: ; X86: # %bb.0: -; X86-NEXT: vpminsq %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x39,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminsq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x39,0xd1] -; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_q_128: ; X64: # %bb.0: -; X64-NEXT: vpminsq %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x39,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminsq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x39,0xd1] -; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) @@ -5684,21 +6273,34 @@ define <4 x i64>@test_int_x86_avx512_mask_pmins_q_256(<4 x i64> %x0, <4 x i64> % ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminsq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x39,0xd1] -; X86-NEXT: vpminsq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x39,0xc1] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_q_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminsq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x39,0xd1] -; X64-NEXT: vpminsq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x39,0xc1] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_pmins_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmins_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpminsq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x39,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmins_q_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpminsq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x39,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) + ret <4 x i64> %res } declare <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) @@ -5709,71 +6311,94 @@ define <4 x i32>@test_int_x86_avx512_mask_pminu_d_128(<4 x i32> %x0, <4 x i32> % ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminud %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3b,0xd1] -; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3b,0xc1] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_d_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminud %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3b,0xd1] -; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3b,0xc1] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pminu_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pminu_d_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pminu_d_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pminu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pminu_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminud %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pminu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pminu_d_256: ; X86: # %bb.0: -; X86-NEXT: vpminud %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminud %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3b,0xd1] -; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_d_256: ; X64: # %bb.0: -; X64-NEXT: vpminud %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminud %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3b,0xd1] -; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_pminu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pminu_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminuq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x3b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_pminu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pminu_q_128: ; X86: # %bb.0: -; X86-NEXT: vpminuq %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x3b,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminuq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x3b,0xd1] -; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_q_128: ; X64: # %bb.0: -; X64-NEXT: vpminuq %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x3b,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminuq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x3b,0xd1] -; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) @@ -5784,21 +6409,34 @@ define <4 x i64>@test_int_x86_avx512_mask_pminu_q_256(<4 x i64> %x0, <4 x i64> % ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminuq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x3b,0xd1] -; X86-NEXT: vpminuq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3b,0xc1] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_q_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminuq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x3b,0xd1] -; X64-NEXT: vpminuq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3b,0xc1] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_pminu_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pminu_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpminuq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pminu_q_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpminuq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) + ret <4 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) @@ -6600,620 +7238,900 @@ define <8 x i32>@test_int_x86_avx512_mask_psllv8_si(<8 x i32> %x0, <8 x i32> %x1 declare <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pmovzxb_d_128(<16 x i8> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxb_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x31,0xc0] +; CHECK-NEXT: # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_pmovzxb_d_128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_d_128: ; X86: # %bb.0: -; X86-NEXT: vpmovzxbd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x31,0xd0] -; X86-NEXT: # xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxbd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x31,0xc8] ; X86-NEXT: # xmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X86-NEXT: vpmovzxbd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x31,0xc0] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X86-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_d_128: ; X64: # %bb.0: -; X64-NEXT: vpmovzxbd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x31,0xd0] -; X64-NEXT: # xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxbd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x31,0xc8] ; X64-NEXT: # xmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pmovzxb_d_128(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxb_d_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxbd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x31,0xc0] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxb_d_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxbd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x31,0xc0] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 x i32> %res4 + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmovzxb_d_256(<16 x i8> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxb_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x31,0xc0] +; CHECK-NEXT: # ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmovzxb_d_256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_d_256: ; X86: # %bb.0: -; X86-NEXT: vpmovzxbd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x31,0xd0] -; X86-NEXT: # ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxbd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x31,0xc8] ; X86-NEXT: # ymm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X86-NEXT: vpmovzxbd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x31,0xc0] -; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_d_256: ; X64: # %bb.0: -; X64-NEXT: vpmovzxbd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x31,0xd0] -; X64-NEXT: # ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxbd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x31,0xc8] ; X64-NEXT: # ymm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X64-NEXT: vpmovzxbd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x31,0xc0] -; X64-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 + ret <8 x i32> %res } -declare <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8>, <2 x i64>, i8) - -define <2 x i64>@test_int_x86_avx512_mask_pmovzxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) { -; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_q_128: +define <8 x i32>@test_int_x86_avx512_maskz_pmovzxb_d_256(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxb_d_256: ; X86: # %bb.0: -; X86-NEXT: vpmovzxbq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xd0] -; X86-NEXT: # xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmovzxbq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x32,0xc8] -; X86-NEXT: # xmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; X86-NEXT: vpmovzxbq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x32,0xc0] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vpmovzxbd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x31,0xc0] +; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_q_128: +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxb_d_256: ; X64: # %bb.0: -; X64-NEXT: vpmovzxbq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xd0] -; X64-NEXT: # xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpmovzxbq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x32,0xc8] -; X64-NEXT: # xmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; X64-NEXT: vpmovzxbq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x32,0xc0] -; X64-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vpmovzxbd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x31,0xc0] +; X64-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res +} + +declare <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_pmovzxb_q_128(<16 x i8> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxb_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xc0] +; CHECK-NEXT: # xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_mask_pmovzxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_q_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxbq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x32,0xc8] +; X86-NEXT: # xmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_q_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovzxbq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x32,0xc8] +; X64-NEXT: # xmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2) - %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1) - %res3 = add <2 x i64> %res, %res1 - %res4 = add <2 x i64> %res3, %res2 - ret <2 x i64> %res4 + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_maskz_pmovzxb_q_128(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxb_q_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxbq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x32,0xc0] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxb_q_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovzxbq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x32,0xc0] +; X64-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2) + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_pmovzxb_q_256(<16 x i8> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxb_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbq %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x32,0xc0] +; CHECK-NEXT: # ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_pmovzxb_q_256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_q_256: ; X86: # %bb.0: -; X86-NEXT: vpmovzxbq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x32,0xd0] -; X86-NEXT: # ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxbq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x32,0xc8] ; X86-NEXT: # ymm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; X86-NEXT: vpmovzxbq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x32,0xc0] -; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_q_256: ; X64: # %bb.0: -; X64-NEXT: vpmovzxbq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x32,0xd0] -; X64-NEXT: # ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxbq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x32,0xc8] ; X64-NEXT: # ymm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_pmovzxb_q_256(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxb_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxbq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x32,0xc0] +; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxb_q_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxbq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x32,0xc0] ; X64-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2) + ret <4 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_pmovzxd_q_128(<4 x i32> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxd_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxdq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xc0] +; CHECK-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_pmovzxd_q_128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxd_q_128: ; X86: # %bb.0: -; X86-NEXT: vpmovzxdq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xd0] -; X86-NEXT: # xmm2 = xmm0[0],zero,xmm0[1],zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxdq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x35,0xc8] ; X86-NEXT: # xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero -; X86-NEXT: vpmovzxdq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x35,0xc0] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxd_q_128: ; X64: # %bb.0: -; X64-NEXT: vpmovzxdq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xd0] -; X64-NEXT: # xmm2 = xmm0[0],zero,xmm0[1],zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxdq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x35,0xc8] ; X64-NEXT: # xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_maskz_pmovzxd_q_128(<4 x i32> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxd_q_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxdq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x35,0xc0] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxd_q_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxdq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x35,0xc0] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2) - %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 -1) - %res3 = add <2 x i64> %res, %res1 - %res4 = add <2 x i64> %res3, %res2 - ret <2 x i64> %res4 + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2) + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_pmovzxd_q_256(<4 x i32> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxd_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxdq %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x35,0xc0] +; CHECK-NEXT: # ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_pmovzxd_q_256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxd_q_256: ; X86: # %bb.0: -; X86-NEXT: vpmovzxdq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x35,0xd0] -; X86-NEXT: # ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxdq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x35,0xc8] ; X86-NEXT: # ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-NEXT: vpmovzxdq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x35,0xc0] -; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxd_q_256: ; X64: # %bb.0: -; X64-NEXT: vpmovzxdq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x35,0xd0] -; X64-NEXT: # ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxdq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x35,0xc8] ; X64-NEXT: # ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_pmovzxd_q_256(<4 x i32> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxd_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxdq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x35,0xc0] +; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxd_q_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxdq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x35,0xc0] ; X64-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 -1) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2) + ret <4 x i64> %res } declare <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pmovzxw_d_128(<8 x i16> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxw_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxwd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0] +; CHECK-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + + define <4 x i32>@test_int_x86_avx512_mask_pmovzxw_d_128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxw_d_128: ; X86: # %bb.0: -; X86-NEXT: vpmovzxwd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xd0] -; X86-NEXT: # xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxwd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x33,0xc8] ; X86-NEXT: # xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-NEXT: vpmovzxwd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x33,0xc0] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxw_d_128: ; X64: # %bb.0: -; X64-NEXT: vpmovzxwd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xd0] -; X64-NEXT: # xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x33,0xc8] ; X64-NEXT: # xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + + +define <4 x i32>@test_int_x86_avx512_maskz_pmovzxw_d_128(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxw_d_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxwd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x33,0xc0] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxw_d_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x33,0xc0] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 x i32> %res4 + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmovzxw_d_256(<8 x i16> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxw_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxwd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x33,0xc0] +; CHECK-NEXT: # ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmovzxw_d_256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxw_d_256: ; X86: # %bb.0: -; X86-NEXT: vpmovzxwd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x33,0xd0] -; X86-NEXT: # ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxwd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x33,0xc8] ; X86-NEXT: # ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; X86-NEXT: vpmovzxwd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x33,0xc0] -; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxw_d_256: ; X64: # %bb.0: -; X64-NEXT: vpmovzxwd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x33,0xd0] -; X64-NEXT: # ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x33,0xc8] ; X64-NEXT: # ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_pmovzxw_d_256(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxw_d_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxwd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x33,0xc0] +; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxw_d_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x33,0xc0] ; X64-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res } declare <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_pmovzxw_q_128(<8 x i16> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxw_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxwq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x34,0xc0] +; CHECK-NEXT: # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_pmovzxw_q_128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxw_q_128: ; X86: # %bb.0: -; X86-NEXT: vpmovzxwq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x34,0xd0] -; X86-NEXT: # xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxwq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x34,0xc8] ; X86-NEXT: # xmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; X86-NEXT: vpmovzxwq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x34,0xc0] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxw_q_128: ; X64: # %bb.0: -; X64-NEXT: vpmovzxwq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x34,0xd0] -; X64-NEXT: # xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x34,0xc8] ; X64-NEXT: # xmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_maskz_pmovzxw_q_128(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxw_q_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxwq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x34,0xc0] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxw_q_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x34,0xc0] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2) - %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1) - %res3 = add <2 x i64> %res, %res1 - %res4 = add <2 x i64> %res3, %res2 - ret <2 x i64> %res4 + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2) + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_pmovzxw_q_256(<8 x i16> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxw_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxwq %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x34,0xc0] +; CHECK-NEXT: # ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_pmovzxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxw_q_256: ; X86: # %bb.0: -; X86-NEXT: vpmovzxwq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x34,0xd0] -; X86-NEXT: # ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxwq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x34,0xc8] ; X86-NEXT: # ymm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X86-NEXT: vpmovzxwq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x34,0xc0] -; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxw_q_256: ; X64: # %bb.0: -; X64-NEXT: vpmovzxwq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x34,0xd0] -; X64-NEXT: # ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x34,0xc8] ; X64-NEXT: # ymm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_pmovzxw_q_256(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxw_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxwq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x34,0xc0] +; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxw_q_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x34,0xc0] ; X64-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2) + ret <4 x i64> %res } declare <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pmovsxb_d_128(<16 x i8> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxb_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_pmovsxb_d_128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_d_128: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsxbd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x21,0xc8] -; X86-NEXT: vpmovsxbd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x21,0xc0] -; X86-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_d_128: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxbd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x21,0xc8] -; X64-NEXT: vpmovsxbd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x21,0xc0] -; X64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 x i32> %res4 + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pmovsxb_d_128(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxb_d_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxbd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x21,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxb_d_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxbd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x21,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmovsxb_d_256(<16 x i8> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxb_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmovsxb_d_256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_d_256: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsxbd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x21,0xc8] -; X86-NEXT: vpmovsxbd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x21,0xc0] -; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_d_256: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxbd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x21,0xc8] -; X64-NEXT: vpmovsxbd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x21,0xc0] -; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_pmovsxb_d_256(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxb_d_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxbd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x21,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxb_d_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxbd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x21,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res } declare <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_ask_pmovsxb_q_128(<16 x i8> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_ask_pmovsxb_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_pmovsxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_q_128: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsxbq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x22,0xc8] -; X86-NEXT: vpmovsxbq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x22,0xc0] -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_q_128: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxbq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x22,0xc8] -; X64-NEXT: vpmovsxbq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x22,0xc0] -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2) - %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1) - %res3 = add <2 x i64> %res, %res1 - %res4 = add <2 x i64> %res3, %res2 - ret <2 x i64> %res4 + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_maskz_pmovsxb_q_128(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxb_q_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxbq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x22,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxb_q_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxbq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x22,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2) + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_pmovsxb_q_256(<16 x i8> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxb_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x22,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_pmovsxb_q_256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_q_256: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x22,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsxbq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x22,0xc8] -; X86-NEXT: vpmovsxbq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x22,0xc0] -; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_q_256: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x22,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxbq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x22,0xc8] -; X64-NEXT: vpmovsxbq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x22,0xc0] -; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_pmovsxb_q_256(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxb_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxbq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x22,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxb_q_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxbq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x22,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2) + ret <4 x i64> %res } declare <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pmovsxw_d_128(<8 x i16> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxw_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x23,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_pmovsxw_d_128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxw_d_128: ; X86: # %bb.0: -; X86-NEXT: vpmovsxwd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x23,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsxwd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x23,0xc8] -; X86-NEXT: vpmovsxwd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x23,0xc0] -; X86-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxw_d_128: ; X64: # %bb.0: -; X64-NEXT: vpmovsxwd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x23,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxwd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x23,0xc8] -; X64-NEXT: vpmovsxwd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x23,0xc0] -; X64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 x i32> %res4 + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pmovsxw_d_128(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxw_d_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxwd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x23,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxw_d_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxwd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x23,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmovsxw_d_256(<8 x i16> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxw_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x23,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmovsxw_d_256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxw_d_256: ; X86: # %bb.0: -; X86-NEXT: vpmovsxwd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x23,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsxwd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x23,0xc8] -; X86-NEXT: vpmovsxwd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x23,0xc0] -; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxw_d_256: ; X64: # %bb.0: -; X64-NEXT: vpmovsxwd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x23,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxwd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x23,0xc8] -; X64-NEXT: vpmovsxwd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x23,0xc0] -; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_pmovsxw_d_256(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxw_d_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxwd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x23,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxw_d_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxwd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x23,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res } declare <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_pmovsxw_q_128(<8 x i16> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxw_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x24,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_pmovsxw_q_128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxw_q_128: ; X86: # %bb.0: -; X86-NEXT: vpmovsxwq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x24,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmovsxwq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x24,0xc8] -; X86-NEXT: vpmovsxwq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x24,0xc0] -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vpmovsxwq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x24,0xc8] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_pmovsxw_q_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxwq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x24,0xc8] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_maskz_pmovsxw_q_128(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxw_q_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxwq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x24,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxw_q_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxwq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x24,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2) + ret <2 x i64> %res +} + +declare <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_pmovsxw_q_256(<8 x i16> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxw_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x24,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_mask_pmovsxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_mask_pmovsxw_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxwq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x24,0xc8] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_pmovsxw_q_128: +; X64-LABEL: test_int_x86_avx512_mask_pmovsxw_q_256: ; X64: # %bb.0: -; X64-NEXT: vpmovsxwq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x24,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpmovsxwq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x24,0xc8] -; X64-NEXT: vpmovsxwq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x24,0xc0] -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vpmovsxwq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x24,0xc8] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2) - %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1) - %res3 = add <2 x i64> %res, %res1 - %res4 = add <2 x i64> %res3, %res2 - ret <2 x i64> %res4 + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) + ret <4 x i64> %res } -declare <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16>, <4 x i64>, i8) - -define <4 x i64>@test_int_x86_avx512_mask_pmovsxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) { -; X86-LABEL: test_int_x86_avx512_mask_pmovsxw_q_256: +define <4 x i64>@test_int_x86_avx512_maskz_pmovsxw_q_256(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxw_q_256: ; X86: # %bb.0: -; X86-NEXT: vpmovsxwq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x24,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmovsxwq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x24,0xc8] ; X86-NEXT: vpmovsxwq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x24,0xc0] -; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_pmovsxw_q_256: +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxw_q_256: ; X64: # %bb.0: -; X64-NEXT: vpmovsxwq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x24,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpmovsxwq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x24,0xc8] ; X64-NEXT: vpmovsxwq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x24,0xc0] -; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2) + ret <4 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) @@ -7426,445 +8344,607 @@ define <4 x i64>@test_int_x86_avx512_mask_psrav_q_256(<4 x i64> %x0, <4 x i64> % declare <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_cvt_dq2pd_128(<4 x i32> %x0, <2 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_dq2pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0xe6,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_cvt_dq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_128: ; X86: # %bb.0: -; X86-NEXT: vcvtdq2pd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0xe6,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtdq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0xe6,0xc8] -; X86-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_128: ; X64: # %bb.0: -; X64-NEXT: vcvtdq2pd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0xe6,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtdq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0xe6,0xc8] -; X64-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) - %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 -1) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_cvt_dq2pd_256(<4 x i32> %x0, <4 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_dq2pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0xe6,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_cvt_dq2pd_256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_256: ; X86: # %bb.0: -; X86-NEXT: vcvtdq2pd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0xe6,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtdq2pd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0xe6,0xc8] -; X86-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_256: ; X64: # %bb.0: -; X64-NEXT: vcvtdq2pd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0xe6,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtdq2pd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0xe6,0xc8] -; X64-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) - %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + ret <4 x double> %res } declare <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_cvt_udq2pd_128(<4 x i32> %x0, <2 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_udq2pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_cvt_udq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_128: ; X86: # %bb.0: -; X86-NEXT: vcvtudq2pd %xmm0, %xmm2 # encoding: [0x62,0xf1,0x7e,0x08,0x7a,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtudq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x7a,0xc8] -; X86-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_128: ; X64: # %bb.0: -; X64-NEXT: vcvtudq2pd %xmm0, %xmm2 # encoding: [0x62,0xf1,0x7e,0x08,0x7a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtudq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x7a,0xc8] -; X64-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) - %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 -1) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_cvt_udq2pd_256(<4 x i32> %x0, <4 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_udq2pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7e,0x28,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_cvt_udq2pd_256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_256: ; X86: # %bb.0: -; X86-NEXT: vcvtudq2pd %xmm0, %ymm2 # encoding: [0x62,0xf1,0x7e,0x28,0x7a,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtudq2pd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x7a,0xc8] -; X86-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_256: ; X64: # %bb.0: -; X64-NEXT: vcvtudq2pd %xmm0, %ymm2 # encoding: [0x62,0xf1,0x7e,0x28,0x7a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtudq2pd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x7a,0xc8] -; X64-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) - %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + ret <4 x double> %res } declare <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32>, <4 x i32>, i32, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_valign_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3) { +; CHECK-LABEL: test_int_x86_avx512_valign_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpalignr $8, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xc1,0x08] +; CHECK-NEXT: # xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_valign_d_128: ; X86: # %bb.0: -; X86-NEXT: vpalignr $8, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xd9,0x08] -; X86-NEXT: # xmm3 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02] ; X86-NEXT: # xmm2 {%k1} = xmm1[2,3],xmm0[0,1] -; X86-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02] -; X86-NEXT: # xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1] -; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_valign_d_128: ; X64: # %bb.0: -; X64-NEXT: vpalignr $8, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xd9,0x08] -; X64-NEXT: # xmm3 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02] ; X64-NEXT: # xmm2 {%k1} = xmm1[2,3],xmm0[0,1] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 %x4) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_valign_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_valign_d_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02] +; X86-NEXT: # xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_valign_d_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02] ; X64-NEXT: # xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1] -; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 %x4) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 -1) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> zeroinitializer,i8 %x4) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 x i32> %res4 + %res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> zeroinitializer,i8 %x4) + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_valign_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3) { +; CHECK-LABEL: test_int_x86_avx512_valign_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: valignq $3, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x03,0xc1,0x03] +; CHECK-NEXT: # ymm0 = ymm1[3],ymm0[0,1,2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 6, <8 x i32> %x3, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_valign_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_valign_d_256: ; X86: # %bb.0: -; X86-NEXT: valignq $3, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03] -; X86-NEXT: # ymm3 = ymm1[3],ymm0[0,1,2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: valignd $6, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x03,0xd1,0x06] ; X86-NEXT: # ymm2 {%k1} = ymm1[6,7],ymm0[0,1,2,3,4,5] -; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_valign_d_256: ; X64: # %bb.0: -; X64-NEXT: valignq $3, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03] -; X64-NEXT: # ymm3 = ymm1[3],ymm0[0,1,2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: valignd $6, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x03,0xd1,0x06] ; X64-NEXT: # ymm2 {%k1} = ymm1[6,7],ymm0[0,1,2,3,4,5] -; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 6, <8 x i32> %x3, i8 %x4) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 6, <8 x i32> %x3, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64>, <2 x i64>, i32, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_valign_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_valign_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpalignr $8, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xc1,0x08] +; CHECK-NEXT: # xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 1, <2 x i64> %x3, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_valign_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_valign_q_128: ; X86: # %bb.0: -; X86-NEXT: vpalignr $8, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xd9,0x08] -; X86-NEXT: # xmm3 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: valignq $1, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x03,0xd1,0x01] ; X86-NEXT: # xmm2 {%k1} = xmm1[1],xmm0[0] -; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_valign_q_128: ; X64: # %bb.0: -; X64-NEXT: vpalignr $8, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xd9,0x08] -; X64-NEXT: # xmm3 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: valignq $1, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x03,0xd1,0x01] ; X64-NEXT: # xmm2 {%k1} = xmm1[1],xmm0[0] -; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 1, <2 x i64> %x3, i8 %x4) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 1, <2 x i64> %x3, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_valign_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_valign_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: valignq $3, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x03,0xc1,0x03] +; CHECK-NEXT: # ymm0 = ymm1[3],ymm0[0,1,2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 3, <4 x i64> %x3, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_valign_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_valign_q_256: ; X86: # %bb.0: -; X86-NEXT: valignq $3, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03] -; X86-NEXT: # ymm3 = ymm1[3],ymm0[0,1,2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: valignq $3, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x03,0xd1,0x03] ; X86-NEXT: # ymm2 {%k1} = ymm1[3],ymm0[0,1,2] -; X86-NEXT: vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_valign_q_256: ; X64: # %bb.0: -; X64-NEXT: valignq $3, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03] -; X64-NEXT: # ymm3 = ymm1[3],ymm0[0,1,2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: valignq $3, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x03,0xd1,0x03] ; X64-NEXT: # ymm2 {%k1} = ymm1[3],ymm0[0,1,2] -; X64-NEXT: vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 3, <4 x i64> %x3, i8 %x4) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 3, <4 x i64> %x3, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_vpermilvar_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_vpermilvar_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_256: ; X86: # %bb.0: -; X86-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0d,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermilpd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x0d,0xd1] -; X86-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x0d,0xc1] -; X86-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; X86-NEXT: vaddpd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] +; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_256: ; X64: # %bb.0: -; X64-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0d,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilpd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x0d,0xd1] -; X64-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x0d,0xc1] -; X64-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; X64-NEXT: vaddpd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] +; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) - %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3) - %res2 = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res2, %res3 - ret <4 x double> %res4 + ret <4 x double> %res +} + +define <4 x double>@test_int_x86_avx512_maskz_vpermilvar_pd_256(<4 x double> %x0, <4 x i64> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermilvar_pd_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x0d,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermilvar_pd_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x0d,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3) + ret <4 x double> %res } declare <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_vpermilvar_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_vpermilvar_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_128: ; X86: # %bb.0: -; X86-NEXT: vpermilpd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0d,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermilpd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x0d,0xd1] -; X86-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x0d,0xc1] -; X86-NEXT: vaddpd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc0] -; X86-NEXT: vaddpd %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3] +; X86-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_128: ; X64: # %bb.0: -; X64-NEXT: vpermilpd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0d,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilpd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x0d,0xd1] -; X64-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x0d,0xc1] -; X64-NEXT: vaddpd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc0] -; X64-NEXT: vaddpd %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3] +; X64-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) - %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> zeroinitializer, i8 %x3) - %res2 = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res3, %res2 - ret <2 x double> %res4 + ret <2 x double> %res +} + +define <2 x double>@test_int_x86_avx512_maskz_vpermilvar_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermilvar_pd_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x0d,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermilvar_pd_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x0d,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> zeroinitializer, i8 %x3) + ret <2 x double> %res } declare <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_vpermilvar_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0c,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_vpermilvar_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_256: ; X86: # %bb.0: -; X86-NEXT: vpermilps %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0c,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermilps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x0c,0xd1] -; X86-NEXT: vpermilps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x0c,0xc1] -; X86-NEXT: vaddps %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; X86-NEXT: vaddps %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_256: ; X64: # %bb.0: -; X64-NEXT: vpermilps %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0c,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x0c,0xd1] -; X64-NEXT: vpermilps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x0c,0xc1] -; X64-NEXT: vaddps %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; X64-NEXT: vaddps %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) - %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3) - %res2 = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res3, %res2 - ret <8 x float> %res4 + ret <8 x float> %res +} + +define <8 x float>@test_int_x86_avx512_maskz_vpermilvar_ps_256(<8 x float> %x0, <8 x i32> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermilvar_ps_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermilps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x0c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermilvar_ps_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermilps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x0c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3) + ret <8 x float> %res } declare <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_vpermilvar_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_vpermilvar_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_128: ; X86: # %bb.0: -; X86-NEXT: vpermilps %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermilps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x0c,0xd1] -; X86-NEXT: vpermilps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x0c,0xc1] -; X86-NEXT: vaddps %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] -; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc0] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_128: ; X64: # %bb.0: -; X64-NEXT: vpermilps %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x0c,0xd1] -; X64-NEXT: vpermilps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x0c,0xc1] -; X64-NEXT: vaddps %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc0] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) - %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> zeroinitializer, i8 %x3) - %res2 = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1) - %res3 = fadd <4 x float> %res, %res1 - %res4 = fadd <4 x float> %res2, %res3 - ret <4 x float> %res4 + ret <4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_maskz_vpermilvar_ps_128(<4 x float> %x0, <4 x i32> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermilvar_ps_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermilps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x0c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermilvar_ps_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermilps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x0c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> zeroinitializer, i8 %x3) + ret <4 x float> %res } declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float>, i32, <4 x float>, i8) -define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) { -; X86-LABEL: test_int_x86_avx512_mask_vextractf32x4_256: +define <4 x float>@test_int_x86_avx512_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vextractf32x4_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 -1) + ret <4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_mask_vextractf32x4_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vextractf32x4_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> %x2, i8 %x3) + ret <4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_maskz_vextractf32x4_256(<8 x float> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vextractf32x4_256: ; X86: # %bb.0: -; X86-NEXT: vextractf128 $1, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01] ; X86-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x19,0xc0,0x01] -; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; X86-NEXT: vaddps %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_vextractf32x4_256: +; X64-LABEL: test_int_x86_avx512_maskz_vextractf32x4_256: ; X64: # %bb.0: -; X64-NEXT: vextractf128 $1, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01] ; X64-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x19,0xc0,0x01] -; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> %x2, i8 %x3) - %res1 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 %x3) - %res2 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 -1) - %res3 = fadd <4 x float> %res, %res1 - %res4 = fadd <4 x float> %res2, %res3 - ret <4 x float> %res4 + %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 %x3) + ret <4 x float> %res } declare <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float>, <4 x float>, i32, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_insertf32x4_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_insertf32x4_256: ; X86: # %bb.0: -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01] -; X86-NEXT: vaddps %ymm3, %ymm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xd3] -; X86-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01] -; X86-NEXT: vaddps %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_insertf32x4_256: ; X64: # %bb.0: -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01] -; X64-NEXT: vaddps %ymm3, %ymm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xd3] -; X64-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01] -; X64-NEXT: vaddps %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4) - %res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1) - %res2 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> zeroinitializer, i8 %x4) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res2, %res3 - ret <8 x float> %res4 + ret <8 x float> %res +} + +define <8 x float>@test_int_x86_avx512_maskz_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_insertf32x4_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_insertf32x4_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> zeroinitializer, i8 %x4) + ret <8 x float> %res } declare <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32>, <4 x i32>, i32, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3) { +; CHECK-LABEL: test_int_x86_avx512_inserti32x4_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_inserti32x4_256: ; X86: # %bb.0: -; X86-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01] -; X86-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01] -; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] -; X86-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_inserti32x4_256: ; X64: # %bb.0: -; X64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01] -; X64-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01] -; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] -; X64-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 -1) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> zeroinitializer, i8 %x4) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res2, %res3 - ret <8 x i32> %res4 + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_inserti32x4_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_inserti32x4_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> zeroinitializer, i8 %x4) + ret <8 x i32> %res } define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { @@ -9011,35 +10091,54 @@ declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounw declare <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float>, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_broadcastf32x4_256(<4 x float> %x0, <8 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcastf32x4_256: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256(<4 x float> %x0, <8 x float> %x2, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256: ; X86: # %bb.0: ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd0,0x01] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x18,0xc8,0x01] -; X86-NEXT: vaddps %ymm1, %ymm2, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9] -; X86-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc0,0x01] -; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256: ; X64: # %bb.0: ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x18,0xc8,0x01] -; X64-NEXT: vaddps %ymm1, %ymm2, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 %mask) + ret <8 x float> %res +} + +define <8 x float>@test_int_x86_avx512_maskz_broadcastf32x4_256(<4 x float> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcastf32x4_256: +; X86: # %bb.0: +; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc0,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_broadcastf32x4_256: +; X64: # %bb.0: +; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc0,0x01] -; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 -1) - %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 %mask) - %res3 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %mask) - %res4 = fadd <8 x float> %res1, %res2 - %res5 = fadd <8 x float> %res3, %res4 - ret <8 x float> %res5 + %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %mask) + ret <8 x float> %res } define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256_load(<4 x float>* %x0ptr, <8 x float> %x2, i8 %mask) { @@ -9065,35 +10164,54 @@ define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256_load(<4 x float>* declare <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_broadcasti32x4_256(<4 x i32> %x0, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcasti32x4_256: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256: ; X86: # %bb.0: ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd0,0x01] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x38,0xc8,0x01] -; X86-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc0,0x01] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; X86-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256: ; X64: # %bb.0: ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x38,0xc8,0x01] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_broadcasti32x4_256(<4 x i32> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcasti32x4_256: +; X86: # %bb.0: +; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc0,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_broadcasti32x4_256: +; X64: # %bb.0: +; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc0,0x01] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; X64-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res1 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) - %res3 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask) - %res4 = add <8 x i32> %res1, %res2 - %res5 = add <8 x i32> %res3, %res4 - ret <8 x i32> %res5 + %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask) + ret <8 x i32> %res } define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256_load(<4 x i32>* %x0ptr, <8 x i32> %x2, i8 %mask) { @@ -10199,52 +11317,62 @@ declare < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32>, < 8 x i32>, < declare <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_cvt_dq2ps_128(<4 x i32> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_dq2ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_cvt_dq2ps_128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_128: ; X86: # %bb.0: -; X86-NEXT: vcvtdq2ps %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5b,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtdq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x5b,0xc8] -; X86-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_128: ; X64: # %bb.0: -; X64-NEXT: vcvtdq2ps %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5b,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtdq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x5b,0xc8] -; X64-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) - %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res } declare <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32>, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_cvt_dq2ps_256(<8 x i32> %x0, <8 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_dq2ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_cvt_dq2ps_256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_256: ; X86: # %bb.0: -; X86-NEXT: vcvtdq2ps %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5b,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtdq2ps %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x5b,0xc8] -; X86-NEXT: vaddps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc2] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_256: ; X64: # %bb.0: -; X64-NEXT: vcvtdq2ps %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5b,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtdq2ps %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x5b,0xc8] -; X64-NEXT: vaddps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc2] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) - %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 -1) - %res2 = fadd <8 x float> %res, %res1 - ret <8 x float> %res2 + ret <8 x float> %res } define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) { @@ -10343,307 +11471,409 @@ declare <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16>, <8 x float>, declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2dq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xff,0xe6,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_256: ; X86: # %bb.0: -; X86-NEXT: vcvtpd2dq %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xff,0xe6,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2dq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0xe6,0xc8] -; X86-NEXT: vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_256: ; X64: # %bb.0: -; X64-NEXT: vcvtpd2dq %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xff,0xe6,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2dq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0xe6,0xc8] -; X64-NEXT: vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_cvt_pd2ps_256(<4 x double> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5a,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double> %x0, <4 x float> %x1, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps_256(<4 x double> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_256: ; X86: # %bb.0: -; X86-NEXT: vcvtpd2ps %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5a,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x5a,0xc8] -; X86-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_256: ; X64: # %bb.0: -; X64-NEXT: vcvtpd2ps %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x5a,0xc8] -; X64-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double> %x0, <4 x float> %x1, i8 %x2) - %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double> %x0, <4 x float> %x1, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res } declare <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_cvt_ps2pd_256(<4 x float> %x0, <4 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float> %x0, <4 x double> %x1, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_cvt_ps2pd_256(<4 x float> %x0, <4 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_256: ; X86: # %bb.0: -; X86-NEXT: vcvtps2pd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2pd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x5a,0xc8] -; X86-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_256: ; X64: # %bb.0: -; X64-NEXT: vcvtps2pd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2pd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x5a,0xc8] -; X64-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float> %x0, <4 x double> %x1, i8 %x2) - %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float> %x0, <4 x double> %x1, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + ret <4 x double> %res } declare <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_cvt_ps2pd_128(<4 x float> %x0, <2 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2pd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float> %x0, <2 x double> %x1, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_cvt_ps2pd_128(<4 x float> %x0, <2 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_128: ; X86: # %bb.0: -; X86-NEXT: vcvtps2pd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5a,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x5a,0xc8] -; X86-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_128: ; X64: # %bb.0: -; X64-NEXT: vcvtps2pd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x5a,0xc8] -; X64-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float> %x0, <2 x double> %x1, i8 %x2) - %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float> %x0, <2 x double> %x1, i8 -1) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 + ret <2 x double> %res } declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvtt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_pd2dq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_256: ; X86: # %bb.0: -; X86-NEXT: vcvttpd2dq %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttpd2dq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xe6,0xc8] -; X86-NEXT: vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_256: ; X64: # %bb.0: -; X64-NEXT: vcvttpd2dq %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2dq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xe6,0xc8] -; X64-NEXT: vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvtt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ps2dq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_128: ; X86: # %bb.0: -; X86-NEXT: vcvttps2dq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5b,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttps2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x5b,0xc8] -; X86-NEXT: vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_128: ; X64: # %bb.0: -; X64-NEXT: vcvttps2dq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5b,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x5b,0xc8] -; X64-NEXT: vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_cvtt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ps2dq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_256: ; X86: # %bb.0: -; X86-NEXT: vcvttps2dq %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttps2dq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x5b,0xc8] -; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc2] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_256: ; X64: # %bb.0: -; X64-NEXT: vcvttps2dq %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2dq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x5b,0xc8] -; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc2] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float>, <8 x i32>, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_permvar_sf_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_permvar_sf_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_permvar_sf_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_sf_256: ; X86: # %bb.0: -; X86-NEXT: vpermps %ymm0, %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermps %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x16,0xd0] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_permvar_sf_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermps %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x16,0xd0] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) + ret <8 x float> %res +} + +define <8 x float>@test_int_x86_avx512_maskz_permvar_sf_256(<8 x float> %x0, <8 x i32> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_sf_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermps %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x16,0xc0] -; X86-NEXT: vaddps %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; X86-NEXT: vaddps %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_permvar_sf_256: +; X64-LABEL: test_int_x86_avx512_maskz_permvar_sf_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x16,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3) + ret <8 x float> %res +} + +declare <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_permvar_si_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_si_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_mask_permvar_si_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_mask_permvar_si_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermd %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x36,0xd0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_permvar_si_256: ; X64: # %bb.0: -; X64-NEXT: vpermps %ymm0, %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xd8] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermps %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x16,0xd0] -; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x16,0xc0] -; X64-NEXT: vaddps %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; X64-NEXT: vaddps %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3] +; X64-NEXT: vpermd %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x36,0xd0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) - %res1 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3) - %res2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res3, %res2 - ret <8 x float> %res4 + %res = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) + ret <8 x i32> %res } -declare <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) - -define <8 x i32>@test_int_x86_avx512_mask_permvar_si_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { -; X86-LABEL: test_int_x86_avx512_mask_permvar_si_256: +define <8 x i32>@test_int_x86_avx512_maskz_permvar_si_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_si_256: ; X86: # %bb.0: -; X86-NEXT: vpermd %ymm0, %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x36,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermd %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x36,0xd0] ; X86-NEXT: vpermd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x36,0xc0] -; X86-NEXT: vpaddd %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3] -; X86-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_permvar_si_256: +; X64-LABEL: test_int_x86_avx512_maskz_permvar_si_256: ; X64: # %bb.0: -; X64-NEXT: vpermd %ymm0, %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x36,0xd8] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermd %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x36,0xd0] ; X64-NEXT: vpermd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x36,0xc0] -; X64-NEXT: vpaddd %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3] -; X64-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 + %res = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3) + ret <8 x i32> %res } declare <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double>, <4 x i64>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_df_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_df_256: ; X86: # %bb.0: -; X86-NEXT: vpermpd %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x16,0xd0] -; X86-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xc0] -; X86-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; X86-NEXT: vaddpd %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3] +; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_df_256: ; X64: # %bb.0: -; X64-NEXT: vpermpd %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xd8] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x16,0xd0] -; X64-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xc0] -; X64-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; X64-NEXT: vaddpd %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3] +; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) - %res1 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3) - %res2 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res3, %res2 - ret <4 x double> %res4 + ret <4 x double> %res +} + +define <4 x double>@test_int_x86_avx512_maskz_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_df_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_df_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3) + ret <4 x double> %res } declare <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_di_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_di_256: ; X86: # %bb.0: -; X86-NEXT: vpermq %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x36,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermq %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x36,0xd0] -; X86-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xc0] -; X86-NEXT: vpaddq %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_di_256: ; X64: # %bb.0: -; X64-NEXT: vpermq %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x36,0xd8] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermq %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x36,0xd0] -; X64-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xc0] -; X64-NEXT: vpaddq %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_di_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_di_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3) + ret <4 x i64> %res } declare <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32, i8) @@ -10864,106 +12094,122 @@ define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i6 declare <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_cvt_udq2ps_128(<4 x i32> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_udq2ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7f,0x08,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_cvt_udq2ps_128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_128: ; X86: # %bb.0: -; X86-NEXT: vcvtudq2ps %xmm0, %xmm2 # encoding: [0x62,0xf1,0x7f,0x08,0x7a,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtudq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x09,0x7a,0xc8] -; X86-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_128: ; X64: # %bb.0: -; X64-NEXT: vcvtudq2ps %xmm0, %xmm2 # encoding: [0x62,0xf1,0x7f,0x08,0x7a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtudq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x09,0x7a,0xc8] -; X64-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) - %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res } declare <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32>, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_cvt_udq2ps_256(<8 x i32> %x0, <8 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_udq2ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7f,0x28,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_cvt_udq2ps_256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_256: ; X86: # %bb.0: -; X86-NEXT: vcvtudq2ps %ymm0, %ymm2 # encoding: [0x62,0xf1,0x7f,0x28,0x7a,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtudq2ps %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x7a,0xc8] -; X86-NEXT: vaddps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc2] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_256: ; X64: # %bb.0: -; X64-NEXT: vcvtudq2ps %ymm0, %ymm2 # encoding: [0x62,0xf1,0x7f,0x28,0x7a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtudq2ps %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x7a,0xc8] -; X64-NEXT: vaddps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc2] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) - %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 -1) - %res2 = fadd <8 x float> %res, %res1 - ret <8 x float> %res2 + ret <8 x float> %res } declare <4 x i32> @llvm.x86.avx512.mask.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_d_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X86-NEXT: vpermt2d %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x76,0xca] -; X86-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_d_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X64-NEXT: vpermt2d %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x76,0xca] -; X64-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x76,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_d_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7e,0xca] -; X86-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_d_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7e,0xca] -; X64-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) @@ -10971,80 +12217,78 @@ declare <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32>, <4 x i32>, define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7e,0xca] -; X86-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X86-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x76,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7e,0xca] -; X64-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X64-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x76,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) - %res1 = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_d_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X86-NEXT: vpermt2d %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x76,0xca] -; X86-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_d_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X64-NEXT: vpermt2d %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x76,0xca] -; X64-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x76,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_d_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7e,0xca] -; X86-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_d_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7e,0xca] -; X64-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) @@ -11052,107 +12296,108 @@ declare <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32>, <8 x i32>, define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7e,0xca] -; X86-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X86-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x76,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7e,0xca] -; X64-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X64-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x76,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_128: ; X86: # %bb.0: -; X86-NEXT: vmovapd %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8] -; X86-NEXT: vpermt2pd %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7f,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x77,0xca] -; X86-NEXT: vaddpd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3] +; X86-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_128: ; X64: # %bb.0: -; X64-NEXT: vmovapd %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8] -; X64-NEXT: vpermt2pd %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x77,0xca] -; X64-NEXT: vaddpd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3] +; X64-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) - %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_256: ; X86: # %bb.0: -; X86-NEXT: vmovapd %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8] -; X86-NEXT: vpermt2pd %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7f,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x77,0xca] -; X86-NEXT: vaddpd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3] +; X86-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_256: ; X64: # %bb.0: -; X64-NEXT: vmovapd %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8] -; X64-NEXT: vpermt2pd %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x77,0xca] -; X64-NEXT: vaddpd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3] +; X64-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) - %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + ret <4 x double> %res } declare <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128: ; X86: # %bb.0: -; X86-NEXT: vmovaps %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8] -; X86-NEXT: vpermt2ps %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7f,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x77,0xca] -; X86-NEXT: vaddps %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128: ; X64: # %bb.0: -; X64-NEXT: vmovaps %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8] -; X64-NEXT: vpermt2ps %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x77,0xca] -; X64-NEXT: vaddps %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) - %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res } define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> %x0, <2 x i64> %x1, <4 x float> %x2, i8 %x3) { @@ -11177,83 +12422,92 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> % declare <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256: ; X86: # %bb.0: -; X86-NEXT: vmovaps %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8] -; X86-NEXT: vpermt2ps %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7f,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x77,0xca] -; X86-NEXT: vaddps %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256: ; X64: # %bb.0: -; X64-NEXT: vmovaps %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8] -; X64-NEXT: vpermt2ps %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x77,0xca] -; X64-NEXT: vaddps %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) - %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) - %res2 = fadd <8 x float> %res, %res1 - ret <8 x float> %res2 + ret <8 x float> %res } declare <2 x i64> @llvm.x86.avx512.mask.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_q_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X86-NEXT: vpermt2q %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x76,0xca] -; X86-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_q_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X64-NEXT: vpermt2q %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x76,0xca] -; X64-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.vpermt2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2q %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x76,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.vpermt2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_q_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x7e,0xca] -; X86-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_q_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x7e,0xca] -; X64-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.vpermt2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpermt2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <2 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) @@ -11261,80 +12515,78 @@ declare <2 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.128(<2 x i64>, <2 x i64>, define <2 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7e,0xca] -; X86-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X86-NEXT: vpermi2q %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x76,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7e,0xca] -; X64-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X64-NEXT: vpermi2q %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x76,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) - %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_q_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X86-NEXT: vpermt2q %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x76,0xca] -; X86-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_q_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X64-NEXT: vpermt2q %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x76,0xca] -; X64-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.vpermt2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x76,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.vpermt2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_q_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7e,0xca] -; X86-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_q_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7e,0xca] -; X64-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.vpermt2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpermt2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <4 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) @@ -11342,26 +12594,18 @@ declare <4 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.256(<4 x i64>, <4 x i64>, define <4 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7e,0xca] -; X86-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X86-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x76,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7e,0xca] -; X64-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X64-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x76,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) - %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } define void @test_mask_compress_store_pd_128(i8* %addr, <2 x double> %data, i8 %mask) { @@ -14612,35 +15856,54 @@ define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pmov_qd_256(<4 x i64> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmov_qd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovqd %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_256: ; X86: # %bb.0: -; X86-NEXT: vpmovqd %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x35,0xc1] -; X86-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_256: ; X64: # %bb.0: -; X64-NEXT: vpmovqd %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x35,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pmov_qd_256(<4 x i64> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmov_qd_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmov_qd_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] - %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) - %res3 = add <4 x i32> %res0, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 x i32> %res4 + %res = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res } define <2 x double> @test_mask_compress_pd_128(<2 x double> %data, <2 x double> %passthru, i8 %mask) { diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index 05124592f22d84..2d2f88db3d93cb 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -1046,267 +1046,269 @@ declare <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float>, <8 x float> declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>) +define <4 x i32>@test_int_x86_avx512_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %1 +} + define <4 x i32>@test_int_x86_avx512_mask_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_d_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X86-NEXT: vpermt2d %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x76,0xca] -; X86-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_d_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X64-NEXT: vpermt2d %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x76,0xca] -; X64-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x1 - %4 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %res2 = add <4 x i32> %3, %4 - ret <4 x i32> %res2 + ret <4 x i32> %3 +} + +define <4 x i32>@test_int_x86_avx512_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x76,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2) + ret <4 x i32> %1 } define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_d_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7e,0xca] -; X86-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_d_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7e,0xca] -; X64-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x1 - %4 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2) - %res2 = add <4 x i32> %3, %4 - ret <4 x i32> %res2 + ret <4 x i32> %3 } define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7e,0xca] -; X86-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X86-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x76,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7e,0xca] -; X64-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X64-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x76,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer - %4 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2) - %res2 = add <4 x i32> %3, %4 - ret <4 x i32> %res2 + ret <4 x i32> %3 } declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>) +define <8 x i32>@test_int_x86_avx512_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %1 +} + define <8 x i32>@test_int_x86_avx512_mask_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_d_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X86-NEXT: vpermt2d %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x76,0xca] -; X86-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_d_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X64-NEXT: vpermt2d %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x76,0xca] -; X64-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1 - %4 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %res2 = add <8 x i32> %3, %4 - ret <8 x i32> %res2 + ret <8 x i32> %3 +} + +define <8 x i32>@test_int_x86_avx512_ask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_ask_vpermt2var_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x76,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2) + ret <8 x i32> %1 } define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_d_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7e,0xca] -; X86-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_d_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7e,0xca] -; X64-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1 - %4 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2) - %res2 = add <8 x i32> %3, %4 - ret <8 x i32> %res2 + ret <8 x i32> %3 } define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7e,0xca] -; X86-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X86-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x76,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7e,0xca] -; X64-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X64-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x76,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer - %4 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2) - %res2 = add <8 x i32> %3, %4 - ret <8 x i32> %res2 + ret <8 x i32> %3 } declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>) +define <2 x double>@test_int_x86_avx512_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) + ret <2 x double> %1 +} + define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_128: ; X86: # %bb.0: -; X86-NEXT: vmovapd %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8] -; X86-NEXT: vpermt2pd %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7f,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x77,0xca] -; X86-NEXT: vaddpd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3] +; X86-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_128: ; X64: # %bb.0: -; X64-NEXT: vmovapd %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8] -; X64-NEXT: vpermt2pd %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x77,0xca] -; X64-NEXT: vaddpd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3] +; X64-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) %2 = bitcast <2 x i64> %x1 to <2 x double> %3 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <2 x i32> %4 = select <2 x i1> %extract, <2 x double> %1, <2 x double> %2 - %5 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) - %6 = bitcast <2 x i64> %x1 to <2 x double> - %res2 = fadd <2 x double> %4, %5 - ret <2 x double> %res2 + ret <2 x double> %4 } declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>) +define <4 x double>@test_int_x86_avx512_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) + ret <4 x double> %1 +} + define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_256: ; X86: # %bb.0: -; X86-NEXT: vmovapd %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8] -; X86-NEXT: vpermt2pd %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7f,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x77,0xca] -; X86-NEXT: vaddpd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3] +; X86-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_256: ; X64: # %bb.0: -; X64-NEXT: vmovapd %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8] -; X64-NEXT: vpermt2pd %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x77,0xca] -; X64-NEXT: vaddpd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3] +; X64-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) %2 = bitcast <4 x i64> %x1 to <4 x double> %3 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> %4 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %2 - %5 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) - %6 = bitcast <4 x i64> %x1 to <4 x double> - %res2 = fadd <4 x double> %4, %5 - ret <4 x double> %res2 + ret <4 x double> %4 } declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>) +define <4 x float>@test_int_x86_avx512_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) + ret <4 x float> %1 +} + define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128: ; X86: # %bb.0: -; X86-NEXT: vmovaps %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8] -; X86-NEXT: vpermt2ps %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7f,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x77,0xca] -; X86-NEXT: vaddps %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128: ; X64: # %bb.0: -; X64-NEXT: vmovaps %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8] -; X64-NEXT: vpermt2ps %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x77,0xca] -; X64-NEXT: vaddps %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) %2 = bitcast <4 x i32> %x1 to <4 x float> %3 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> %4 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %2 - %5 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) - %6 = bitcast <4 x i32> %x1 to <4 x float> - %res2 = fadd <4 x float> %4, %5 - ret <4 x float> %res2 + ret <4 x float> %4 } define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> %x0, <2 x i64> %x1, <4 x float> %x2, i8 %x3) { @@ -1335,205 +1337,203 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> % declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>) +define <8 x float>@test_int_x86_avx512_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) + ret <8 x float> %1 +} + define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256: ; X86: # %bb.0: -; X86-NEXT: vmovaps %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8] -; X86-NEXT: vpermt2ps %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7f,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x77,0xca] -; X86-NEXT: vaddps %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256: ; X64: # %bb.0: -; X64-NEXT: vmovaps %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8] -; X64-NEXT: vpermt2ps %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x77,0xca] -; X64-NEXT: vaddps %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) %2 = bitcast <8 x i32> %x1 to <8 x float> %3 = bitcast i8 %x3 to <8 x i1> %4 = select <8 x i1> %3, <8 x float> %1, <8 x float> %2 - %5 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) - %6 = bitcast <8 x i32> %x1 to <8 x float> - %res2 = fadd <8 x float> %4, %5 - ret <8 x float> %res2 + ret <8 x float> %4 } declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>) +define <2 x i64>@test_int_x86_avx512_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) + ret <2 x i64> %1 +} + define <2 x i64>@test_int_x86_avx512_mask_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_q_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X86-NEXT: vpermt2q %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x76,0xca] -; X86-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_q_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X64-NEXT: vpermt2q %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x76,0xca] -; X64-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x1 - %4 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) - %res2 = add <2 x i64> %3, %4 - ret <2 x i64> %res2 + ret <2 x i64> %3 +} + +define <2 x i64>@test_int_x86_avx512_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2q %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x76,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2) + ret <2 x i64> %1 } define <2 x i64>@test_int_x86_avx512_mask_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_q_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x7e,0xca] -; X86-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_q_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x7e,0xca] -; X64-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x1 - %4 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2) - %res2 = add <2 x i64> %3, %4 - ret <2 x i64> %res2 + ret <2 x i64> %3 } define <2 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7e,0xca] -; X86-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X86-NEXT: vpermi2q %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x76,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7e,0xca] -; X64-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X64-NEXT: vpermi2q %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x76,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> zeroinitializer - %4 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2) - %res2 = add <2 x i64> %3, %4 - ret <2 x i64> %res2 + ret <2 x i64> %3 } declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>) +define <4 x i64>@test_int_x86_avx512_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) + ret <4 x i64> %1 +} + define <4 x i64>@test_int_x86_avx512_mask_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_q_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X86-NEXT: vpermt2q %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x76,0xca] -; X86-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_q_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X64-NEXT: vpermt2q %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x76,0xca] -; X64-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x1 - %4 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) - %res2 = add <4 x i64> %3, %4 - ret <4 x i64> %res2 + ret <4 x i64> %3 +} + +define <4 x i64>@test_int_x86_avx512_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x76,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2) + ret <4 x i64> %1 } define <4 x i64>@test_int_x86_avx512_mask_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_q_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7e,0xca] -; X86-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_q_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7e,0xca] -; X64-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x1 - %4 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2) - %res2 = add <4 x i64> %3, %4 - ret <4 x i64> %res2 + ret <4 x i64> %3 } define <4 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7e,0xca] -; X86-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X86-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x76,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7e,0xca] -; X64-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X64-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x76,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> zeroinitializer - %4 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2) - %res2 = add <4 x i64> %3, %4 - ret <4 x i64> %res2 + ret <4 x i64> %3 } declare <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) @@ -2470,41 +2470,60 @@ define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(i8* %ptr, <2 x i64> %x1, ret void } +define <4 x i32>@test_int_x86_avx512_pmov_qd_256(<4 x i64> %x0) { +; CHECK-LABEL: test_int_x86_avx512_pmov_qd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovqd %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = trunc <4 x i64> %x0 to <4 x i32> + ret <4 x i32> %1 +} + define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_256: ; X86: # %bb.0: -; X86-NEXT: vpmovqd %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x35,0xc1] -; X86-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_256: ; X64: # %bb.0: -; X64-NEXT: vpmovqd %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x35,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %1 = trunc <4 x i64> %x0 to <4 x i32> + %2 = bitcast i8 %x2 to <8 x i1> + %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract1, <4 x i32> %1, <4 x i32> %x1 + ret <4 x i32> %3 +} + +define <4 x i32>@test_int_x86_avx512_maskz_pmov_qd_256(<4 x i64> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmov_qd_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmov_qd_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %1 = trunc <4 x i64> %x0 to <4 x i32> - %2 = trunc <4 x i64> %x0 to <4 x i32> - %3 = bitcast i8 %x2 to <8 x i1> - %extract1 = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> - %4 = select <4 x i1> %extract1, <4 x i32> %2, <4 x i32> %x1 - %5 = trunc <4 x i64> %x0 to <4 x i32> - %6 = bitcast i8 %x2 to <8 x i1> - %extract = shufflevector <8 x i1> %6, <8 x i1> %6, <4 x i32> - %7 = select <4 x i1> %extract, <4 x i32> %5, <4 x i32> zeroinitializer - %res3 = add <4 x i32> %1, %4 - %res4 = add <4 x i32> %res3, %7 - ret <4 x i32> %res4 + %2 = bitcast i8 %x2 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer + ret <4 x i32> %3 } declare void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64>, i8) @@ -2534,35 +2553,54 @@ define void @test_int_x86_avx512_mask_pmov_qd_mem_256(i8* %ptr, <4 x i64> %x1, i declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pmovs_qd_256(<4 x i64> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovs_qd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsqd %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x25,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qd_256: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x25,0xc1] -; X86-NEXT: vpmovsqd %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x25,0xc2] -; X86-NEXT: vpaddd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] -; X86-NEXT: vpmovsqd %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x25,0xc0] -; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qd_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpmovsqd %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x25,0xc2] ; X64-NEXT: vpmovsqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x25,0xc1] -; X64-NEXT: vpaddd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] -; X64-NEXT: vpmovsqd %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x25,0xc0] -; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] - %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) - %res3 = add <4 x i32> %res0, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 x i32> %res4 + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pmovs_qd_256(<4 x i64> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovs_qd_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x25,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovs_qd_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x25,0xc0] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res } declare void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64>, i8) @@ -2592,35 +2630,54 @@ define void @test_int_x86_avx512_mask_pmovs_qd_mem_256(i8* %ptr, <4 x i64> %x1, declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pmovus_qd_256(<4 x i64> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovus_qd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovusqd %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x15,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qd_256: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovusqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x15,0xc1] -; X86-NEXT: vpmovusqd %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x15,0xc2] -; X86-NEXT: vpaddd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] -; X86-NEXT: vpmovusqd %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x15,0xc0] -; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qd_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpmovusqd %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x15,0xc2] ; X64-NEXT: vpmovusqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x15,0xc1] -; X64-NEXT: vpaddd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] -; X64-NEXT: vpmovusqd %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x15,0xc0] -; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] - %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) - %res3 = add <4 x i32> %res0, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 x i32> %res4 + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pmovus_qd_256(<4 x i64> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovus_qd_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovusqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x15,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovus_qd_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovusqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x15,0xc0] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res } declare void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64>, i8) @@ -3345,54 +3402,74 @@ define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128(<2 x double> %x0, <4 x i ret <4 x i32> %res2 } +define <4 x i32>@test_int_x86_avx512_cvt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2dq_128_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0xe6,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %res3 +} + define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_128_zext: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0xe6,0xc8] -; X86-NEXT: vcvtpd2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0xe6,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_128_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0xe6,0xc8] -; X64-NEXT: vcvtpd2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0xe6,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> - %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) - %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> - %res4 = add <4 x i32> %res1, %res3 - ret <4 x i32> %res4 + ret <4 x i32> %res1 } declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_cvt_pd2ps(<2 x double> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2ps: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2ps: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x5a,0xc8] -; X86-NEXT: vcvtpd2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0] -; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2ps: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x5a,0xc8] -; X64-NEXT: vcvtpd2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) - %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_cvt_pd2ps_zext(<2 x double> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2ps_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1) + %res3 = shufflevector <4 x float> %res2, <4 x float> zeroinitializer, <4 x i32> + ret <4 x float> %res3 } define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps_zext(<2 x double> %x0, <4 x float> %x1, i8 %x2) { @@ -3401,48 +3478,58 @@ define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps_zext(<2 x double> %x0, <4 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x5a,0xc8] -; X86-NEXT: vcvtpd2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0] -; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x5a,0xc8] -; X64-NEXT: vcvtpd2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) %res1 = shufflevector <4 x float> %res, <4 x float> zeroinitializer, <4 x i32> - %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1) - %res3 = shufflevector <4 x float> %res2, <4 x float> zeroinitializer, <4 x i32> - %res4 = fadd <4 x float> %res1, %res3 - ret <4 x float> %res4 + ret <4 x float> %res1 } declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_128: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8] -; X86-NEXT: vcvtpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x79,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8] -; X64-NEXT: vcvtpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x79,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_cvt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2udq_128_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %res3 } define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) { @@ -3451,35 +3538,39 @@ define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128_zext(<2 x double> %x0, ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8] -; X86-NEXT: vcvtpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x79,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_128_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8] -; X64-NEXT: vcvtpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x79,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> - %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) - %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> - %res4 = add <4 x i32> %res1, %res3 - ret <4 x i32> %res4 + ret <4 x i32> %res1 } declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x28,0x79,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_256: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2udq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x79,0xc8] -; X86-NEXT: vcvtpd2udq %ymm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x28,0x79,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -3487,139 +3578,171 @@ define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_256(<4 x double> %x0, <4 x ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2udq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x79,0xc8] -; X64-NEXT: vcvtpd2udq %ymm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x28,0x79,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2dq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_128: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x5b,0xc8] -; X86-NEXT: vcvtps2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5b,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x5b,0xc8] -; X64-NEXT: vcvtps2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5b,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_cvt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2dq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2dq %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_256: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2dq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x5b,0xc8] -; X86-NEXT: vcvtps2dq %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5b,0xc0] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2dq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x5b,0xc8] -; X64-NEXT: vcvtps2dq %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5b,0xc0] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x08,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_128: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x79,0xc8] -; X86-NEXT: vcvtps2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x08,0x79,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x79,0xc8] -; X64-NEXT: vcvtps2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x08,0x79,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_cvt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2udq %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x28,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_256: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2udq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x79,0xc8] -; X86-NEXT: vcvtps2udq %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x28,0x79,0xc0] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2udq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x79,0xc8] -; X64-NEXT: vcvtps2udq %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x28,0x79,0xc0] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_ask_cvtt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_ask_cvtt_pd2dq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_128: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xe6,0xc8] -; X86-NEXT: vcvttpd2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xe6,0xc8] -; X64-NEXT: vcvttpd2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_cvtt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_pd2dq_128_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %res3 } define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) { @@ -3628,48 +3751,58 @@ define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128_zext(<2 x double> %x0, ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xe6,0xc8] -; X86-NEXT: vcvttpd2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_128_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xe6,0xc8] -; X64-NEXT: vcvttpd2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> - %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) - %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> - %res4 = add <4 x i32> %res1, %res3 - ret <4 x i32> %res4 + ret <4 x i32> %res1 } declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvtt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_pd2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_128: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x78,0xc8] -; X86-NEXT: vcvttpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x78,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x78,0xc8] -; X64-NEXT: vcvttpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x78,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_cvtt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_pd2udq_128_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %res3 } define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) { @@ -3678,35 +3811,39 @@ define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128_zext(<2 x double> %x0, ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x78,0xc8] -; X86-NEXT: vcvttpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x78,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_128_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x78,0xc8] -; X64-NEXT: vcvttpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x78,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> - %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) - %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> - %res4 = add <4 x i32> %res1, %res3 - ret <4 x i32> %res4 + ret <4 x i32> %res1 } declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvtt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_pd2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x28,0x78,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_256: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttpd2udq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x78,0xc8] -; X86-NEXT: vcvttpd2udq %ymm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x28,0x78,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -3714,64 +3851,71 @@ define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_256(<4 x double> %x0, <4 x ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2udq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x78,0xc8] -; X64-NEXT: vcvttpd2udq %ymm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x28,0x78,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvtt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ps2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x08,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_128: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttps2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x78,0xc8] -; X86-NEXT: vcvttps2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x08,0x78,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x78,0xc8] -; X64-NEXT: vcvttps2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x08,0x78,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_cvtt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ps2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2udq %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x28,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_256: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttps2udq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x78,0xc8] -; X86-NEXT: vcvttps2udq %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x28,0x78,0xc0] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2udq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x78,0xc8] -; X64-NEXT: vcvttps2udq %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x28,0x78,0xc0] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double>, i32, <2 x double>, i8) @@ -4652,76 +4796,108 @@ declare <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double>, <2 x double>, i declare <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>) +define <4 x double>@test_int_x86_avx512_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_df_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1) + ret <4 x double> %1 +} + define <4 x double>@test_int_x86_avx512_mask_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_df_256: ; X86: # %bb.0: -; X86-NEXT: vpermpd %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x16,0xd0] -; X86-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xc0] -; X86-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; X86-NEXT: vaddpd %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3] +; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_df_256: ; X64: # %bb.0: -; X64-NEXT: vpermpd %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xd8] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x16,0xd0] -; X64-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xc0] -; X64-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; X64-NEXT: vaddpd %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3] +; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1) %2 = bitcast i8 %x3 to <8 x i1> %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract1, <4 x double> %1, <4 x double> %x2 - %4 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1) - %5 = bitcast i8 %x3 to <8 x i1> - %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> - %6 = select <4 x i1> %extract, <4 x double> %4, <4 x double> zeroinitializer - %7 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1) - %res3 = fadd <4 x double> %3, %6 - %res4 = fadd <4 x double> %res3, %7 - ret <4 x double> %res4 + ret <4 x double> %3 +} + +define <4 x double>@test_int_x86_avx512_maskz_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_df_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_df_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract1, <4 x double> %1, <4 x double> zeroinitializer + ret <4 x double> %3 } declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) +define <4 x i64>@test_int_x86_avx512_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_di_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1) + ret <4 x i64> %1 +} + define <4 x i64>@test_int_x86_avx512_mask_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_di_256: ; X86: # %bb.0: -; X86-NEXT: vpermq %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x36,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermq %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x36,0xd0] -; X86-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xc0] -; X86-NEXT: vpaddq %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_di_256: ; X64: # %bb.0: -; X64-NEXT: vpermq %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x36,0xd8] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermq %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x36,0xd0] -; X64-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xc0] -; X64-NEXT: vpaddq %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1) %2 = bitcast i8 %x3 to <8 x i1> %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract1, <4 x i64> %1, <4 x i64> %x2 - %4 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1) - %5 = bitcast i8 %x3 to <8 x i1> - %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> - %6 = select <4 x i1> %extract, <4 x i64> %4, <4 x i64> zeroinitializer - %7 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1) - %res3 = add <4 x i64> %3, %6 - %res4 = add <4 x i64> %res3, %7 - ret <4 x i64> %res4 + ret <4 x i64> %3 +} + +define <4 x i64>@test_int_x86_avx512_maskz_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_di_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_di_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract1, <4 x i64> %1, <4 x i64> zeroinitializer + ret <4 x i64> %3 } declare <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double>, <2 x double>, <2 x i64>, i32, i8) From 1262db1b6a99d1ed79735e5ef377b1d2c23da3e7 Mon Sep 17 00:00:00 2001 From: Max Moroz Date: Thu, 19 Mar 2020 12:39:37 -0700 Subject: [PATCH 28/33] [compiler-rt] Refactor FuzzedDataProvider for better readability. Summary: Separate class definition and actual methods implementation. The main goal is to keep the list of available methods in a concise readable form inside the class definition. Reviewers: hctim, metzman Subscribers: dberris, #sanitizers Tags: #sanitizers Differential Revision: https://reviews.llvm.org/D76651 --- .../include/fuzzer/FuzzedDataProvider.h | 574 ++++++++++-------- 1 file changed, 316 insertions(+), 258 deletions(-) diff --git a/compiler-rt/include/fuzzer/FuzzedDataProvider.h b/compiler-rt/include/fuzzer/FuzzedDataProvider.h index 9be9c884f0eac5..4945acf4547337 100644 --- a/compiler-rt/include/fuzzer/FuzzedDataProvider.h +++ b/compiler-rt/include/fuzzer/FuzzedDataProvider.h @@ -27,300 +27,358 @@ // In addition to the comments below, the API is also briefly documented at // https://github.com/google/fuzzing/blob/master/docs/split-inputs.md#fuzzed-data-provider class FuzzedDataProvider { - public: +public: // |data| is an array of length |size| that the FuzzedDataProvider wraps to // provide more granular access. |data| must outlive the FuzzedDataProvider. FuzzedDataProvider(const uint8_t *data, size_t size) : data_ptr_(data), remaining_bytes_(size) {} ~FuzzedDataProvider() = default; - // Returns a std::vector containing |num_bytes| of input data. If fewer than - // |num_bytes| of data remain, returns a shorter std::vector containing all - // of the data that's left. Can be used with any byte sized type, such as - // char, unsigned char, uint8_t, etc. - template std::vector ConsumeBytes(size_t num_bytes) { - num_bytes = std::min(num_bytes, remaining_bytes_); - return ConsumeBytes(num_bytes, num_bytes); - } + // See the implementation below (after the class definition) for more verbose + // comments for each of the methods. - // Similar to |ConsumeBytes|, but also appends the terminator value at the end - // of the resulting vector. Useful, when a mutable null-terminated C-string is - // needed, for example. But that is a rare case. Better avoid it, if possible, - // and prefer using |ConsumeBytes| or |ConsumeBytesAsString| methods. + // Methods returning std::vector of bytes. These are the most popular choice + // when splitting fuzzing input into pieces, as every piece is put into a + // separate buffer (i.e. ASan would catch any under-/overflow) and the memory + // will be released automatically. + template std::vector ConsumeBytes(size_t num_bytes); template - std::vector ConsumeBytesWithTerminator(size_t num_bytes, - T terminator = 0) { - num_bytes = std::min(num_bytes, remaining_bytes_); - std::vector result = ConsumeBytes(num_bytes + 1, num_bytes); - result.back() = terminator; - return result; - } + std::vector ConsumeBytesWithTerminator(size_t num_bytes, T terminator = 0); + template std::vector ConsumeRemainingBytes(); - // Returns a std::string containing |num_bytes| of input data. Using this and - // |.c_str()| on the resulting string is the best way to get an immutable - // null-terminated C string. If fewer than |num_bytes| of data remain, returns - // a shorter std::string containing all of the data that's left. - std::string ConsumeBytesAsString(size_t num_bytes) { - static_assert(sizeof(std::string::value_type) == sizeof(uint8_t), - "ConsumeBytesAsString cannot convert the data to a string."); - - num_bytes = std::min(num_bytes, remaining_bytes_); - std::string result( - reinterpret_cast(data_ptr_), - num_bytes); - Advance(num_bytes); - return result; - } + // Methods returning strings. Use only when you need a std::string or a null + // terminated C-string. Otherwise, prefer the methods returning std::vector. + std::string ConsumeBytesAsString(size_t num_bytes); + std::string ConsumeRandomLengthString(size_t max_length); + std::string ConsumeRandomLengthString(); + std::string ConsumeRemainingBytesAsString(); - // Returns a number in the range [min, max] by consuming bytes from the - // input data. The value might not be uniformly distributed in the given - // range. If there's no input data left, always returns |min|. |min| must - // be less than or equal to |max|. - template T ConsumeIntegralInRange(T min, T max) { - static_assert(std::is_integral::value, "An integral type is required."); - static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type."); + // Methods returning integer values. + template T ConsumeIntegral(); + template T ConsumeIntegralInRange(T min, T max); - if (min > max) - abort(); + // Methods returning floating point values. + template T ConsumeFloatingPoint(); + template T ConsumeFloatingPointInRange(T min, T max); - // Use the biggest type possible to hold the range and the result. - uint64_t range = static_cast(max) - min; - uint64_t result = 0; - size_t offset = 0; - - while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 && - remaining_bytes_ != 0) { - // Pull bytes off the end of the seed data. Experimentally, this seems to - // allow the fuzzer to more easily explore the input space. This makes - // sense, since it works by modifying inputs that caused new code to run, - // and this data is often used to encode length of data read by - // |ConsumeBytes|. Separating out read lengths makes it easier modify the - // contents of the data that is actually read. - --remaining_bytes_; - result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_]; - offset += CHAR_BIT; - } + // 0 <= return value <= 1. + template T ConsumeProbability(); - // Avoid division by 0, in case |range + 1| results in overflow. - if (range != std::numeric_limits::max()) - result = result % (range + 1); + bool ConsumeBool(); - return static_cast(min + result); - } + // Returns a value chosen from the given enum. + template T ConsumeEnum(); - // Returns a std::string of length from 0 to |max_length|. When it runs out of - // input data, returns what remains of the input. Designed to be more stable - // with respect to a fuzzer inserting characters than just picking a random - // length and then consuming that many bytes with |ConsumeBytes|. - std::string ConsumeRandomLengthString(size_t max_length) { - // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps "\" - // followed by anything else to the end of the string. As a result of this - // logic, a fuzzer can insert characters into the string, and the string - // will be lengthened to include those new characters, resulting in a more - // stable fuzzer than picking the length of a string independently from - // picking its contents. - std::string result; - - // Reserve the anticipated capaticity to prevent several reallocations. - result.reserve(std::min(max_length, remaining_bytes_)); - for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) { - char next = ConvertUnsignedToSigned(data_ptr_[0]); - Advance(1); - if (next == '\\' && remaining_bytes_ != 0) { - next = ConvertUnsignedToSigned(data_ptr_[0]); - Advance(1); - if (next != '\\') - break; - } - result += next; - } - - result.shrink_to_fit(); - return result; - } - - // Returns a std::string of length from 0 to |remaining_bytes_|. - std::string ConsumeRandomLengthString() { - return ConsumeRandomLengthString(remaining_bytes_); - } + // Returns a value from the given array. + template T PickValueInArray(const T (&array)[size]); + template T PickValueInArray(std::initializer_list list); - // Returns a std::vector containing all remaining bytes of the input data. - template std::vector ConsumeRemainingBytes() { - return ConsumeBytes(remaining_bytes_); - } + // Writes data to the given destination and returns number of bytes written. + size_t ConsumeData(void *destination, size_t num_bytes); - // Returns a std::string containing all remaining bytes of the input data. - // Prefer using |ConsumeRemainingBytes| unless you actually need a std::string - // object. - std::string ConsumeRemainingBytesAsString() { - return ConsumeBytesAsString(remaining_bytes_); - } + // Reports the remaining bytes available for fuzzed input. + size_t remaining_bytes() { return remaining_bytes_; } - // Returns a number in the range [Type's min, Type's max]. The value might - // not be uniformly distributed in the given range. If there's no input data - // left, always returns |min|. - template T ConsumeIntegral() { - return ConsumeIntegralInRange(std::numeric_limits::min(), - std::numeric_limits::max()); - } +private: + FuzzedDataProvider(const FuzzedDataProvider &) = delete; + FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete; - // Reads one byte and returns a bool, or false when no data remains. - bool ConsumeBool() { return 1 & ConsumeIntegral(); } + void CopyAndAdvance(void *destination, size_t num_bytes); - // Returns a copy of the value selected from the given fixed-size |array|. - template - T PickValueInArray(const T (&array)[size]) { - static_assert(size > 0, "The array must be non empty."); - return array[ConsumeIntegralInRange(0, size - 1)]; - } + void Advance(size_t num_bytes); template - T PickValueInArray(std::initializer_list list) { - // TODO(Dor1s): switch to static_assert once C++14 is allowed. - if (!list.size()) - abort(); - - return *(list.begin() + ConsumeIntegralInRange(0, list.size() - 1)); - } + std::vector ConsumeBytes(size_t size, size_t num_bytes); - // Returns an enum value. The enum must start at 0 and be contiguous. It must - // also contain |kMaxValue| aliased to its largest (inclusive) value. Such as: - // enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue }; - template T ConsumeEnum() { - static_assert(std::is_enum::value, "|T| must be an enum type."); - return static_cast(ConsumeIntegralInRange( - 0, static_cast(T::kMaxValue))); - } + template TS ConvertUnsignedToSigned(TU value); - // Returns a floating point number in the range [0.0, 1.0]. If there's no - // input data left, always returns 0. - template T ConsumeProbability() { - static_assert(std::is_floating_point::value, - "A floating point type is required."); - - // Use different integral types for different floating point types in order - // to provide better density of the resulting values. - using IntegralType = - typename std::conditional<(sizeof(T) <= sizeof(uint32_t)), uint32_t, - uint64_t>::type; - - T result = static_cast(ConsumeIntegral()); - result /= static_cast(std::numeric_limits::max()); - return result; - } - - // Returns a floating point value in the range [Type's lowest, Type's max] by - // consuming bytes from the input data. If there's no input data left, always - // returns approximately 0. - template T ConsumeFloatingPoint() { - return ConsumeFloatingPointInRange(std::numeric_limits::lowest(), - std::numeric_limits::max()); - } - - // Returns a floating point value in the given range by consuming bytes from - // the input data. If there's no input data left, returns |min|. Note that - // |min| must be less than or equal to |max|. - template T ConsumeFloatingPointInRange(T min, T max) { - if (min > max) - abort(); + const uint8_t *data_ptr_; + size_t remaining_bytes_; +}; - T range = .0; - T result = min; - constexpr T zero(.0); - if (max > zero && min < zero && max > min + std::numeric_limits::max()) { - // The diff |max - min| would overflow the given floating point type. Use - // the half of the diff as the range and consume a bool to decide whether - // the result is in the first of the second part of the diff. - range = (max / 2.0) - (min / 2.0); - if (ConsumeBool()) { - result += range; - } - } else { - range = max - min; +// Returns a std::vector containing |num_bytes| of input data. If fewer than +// |num_bytes| of data remain, returns a shorter std::vector containing all +// of the data that's left. Can be used with any byte sized type, such as +// char, unsigned char, uint8_t, etc. +template +std::vector FuzzedDataProvider::ConsumeBytes(size_t num_bytes) { + num_bytes = std::min(num_bytes, remaining_bytes_); + return ConsumeBytes(num_bytes, num_bytes); +} + +// Similar to |ConsumeBytes|, but also appends the terminator value at the end +// of the resulting vector. Useful, when a mutable null-terminated C-string is +// needed, for example. But that is a rare case. Better avoid it, if possible, +// and prefer using |ConsumeBytes| or |ConsumeBytesAsString| methods. +template +std::vector FuzzedDataProvider::ConsumeBytesWithTerminator(size_t num_bytes, + T terminator) { + num_bytes = std::min(num_bytes, remaining_bytes_); + std::vector result = ConsumeBytes(num_bytes + 1, num_bytes); + result.back() = terminator; + return result; +} + +// Returns a std::vector containing all remaining bytes of the input data. +template +std::vector FuzzedDataProvider::ConsumeRemainingBytes() { + return ConsumeBytes(remaining_bytes_); +} + +// Returns a std::string containing |num_bytes| of input data. Using this and +// |.c_str()| on the resulting string is the best way to get an immutable +// null-terminated C string. If fewer than |num_bytes| of data remain, returns +// a shorter std::string containing all of the data that's left. +std::string FuzzedDataProvider::ConsumeBytesAsString(size_t num_bytes) { + static_assert(sizeof(std::string::value_type) == sizeof(uint8_t), + "ConsumeBytesAsString cannot convert the data to a string."); + + num_bytes = std::min(num_bytes, remaining_bytes_); + std::string result( + reinterpret_cast(data_ptr_), num_bytes); + Advance(num_bytes); + return result; +} + +// Returns a std::string of length from 0 to |max_length|. When it runs out of +// input data, returns what remains of the input. Designed to be more stable +// with respect to a fuzzer inserting characters than just picking a random +// length and then consuming that many bytes with |ConsumeBytes|. +std::string FuzzedDataProvider::ConsumeRandomLengthString(size_t max_length) { + // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps "\" + // followed by anything else to the end of the string. As a result of this + // logic, a fuzzer can insert characters into the string, and the string + // will be lengthened to include those new characters, resulting in a more + // stable fuzzer than picking the length of a string independently from + // picking its contents. + std::string result; + + // Reserve the anticipated capaticity to prevent several reallocations. + result.reserve(std::min(max_length, remaining_bytes_)); + for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) { + char next = ConvertUnsignedToSigned(data_ptr_[0]); + Advance(1); + if (next == '\\' && remaining_bytes_ != 0) { + next = ConvertUnsignedToSigned(data_ptr_[0]); + Advance(1); + if (next != '\\') + break; } - - return result + range * ConsumeProbability(); + result += next; } - // Writes |num_bytes| of input data to the given destination pointer. If there - // is not enough data left, writes all remaining bytes. Return value is the - // number of bytes written. - // In general, it's better to avoid using this function, but it may be useful - // in cases when it's necessary to fill a certain buffer or object with - // fuzzing data. - size_t ConsumeData(void *destination, size_t num_bytes) { - num_bytes = std::min(num_bytes, remaining_bytes_); - CopyAndAdvance(destination, num_bytes); - return num_bytes; + result.shrink_to_fit(); + return result; +} + +// Returns a std::string of length from 0 to |remaining_bytes_|. +std::string FuzzedDataProvider::ConsumeRandomLengthString() { + return ConsumeRandomLengthString(remaining_bytes_); +} + +// Returns a std::string containing all remaining bytes of the input data. +// Prefer using |ConsumeRemainingBytes| unless you actually need a std::string +// object. +std::string FuzzedDataProvider::ConsumeRemainingBytesAsString() { + return ConsumeBytesAsString(remaining_bytes_); +} + +// Returns a number in the range [Type's min, Type's max]. The value might +// not be uniformly distributed in the given range. If there's no input data +// left, always returns |min|. +template T FuzzedDataProvider::ConsumeIntegral() { + return ConsumeIntegralInRange(std::numeric_limits::min(), + std::numeric_limits::max()); +} + +// Returns a number in the range [min, max] by consuming bytes from the +// input data. The value might not be uniformly distributed in the given +// range. If there's no input data left, always returns |min|. |min| must +// be less than or equal to |max|. +template +T FuzzedDataProvider::ConsumeIntegralInRange(T min, T max) { + static_assert(std::is_integral::value, "An integral type is required."); + static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type."); + + if (min > max) + abort(); + + // Use the biggest type possible to hold the range and the result. + uint64_t range = static_cast(max) - min; + uint64_t result = 0; + size_t offset = 0; + + while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 && + remaining_bytes_ != 0) { + // Pull bytes off the end of the seed data. Experimentally, this seems to + // allow the fuzzer to more easily explore the input space. This makes + // sense, since it works by modifying inputs that caused new code to run, + // and this data is often used to encode length of data read by + // |ConsumeBytes|. Separating out read lengths makes it easier modify the + // contents of the data that is actually read. + --remaining_bytes_; + result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_]; + offset += CHAR_BIT; } - // Reports the remaining bytes available for fuzzed input. - size_t remaining_bytes() { return remaining_bytes_; } - - private: - FuzzedDataProvider(const FuzzedDataProvider &) = delete; - FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete; - - void CopyAndAdvance(void *destination, size_t num_bytes) { - std::memcpy(destination, data_ptr_, num_bytes); - Advance(num_bytes); + // Avoid division by 0, in case |range + 1| results in overflow. + if (range != std::numeric_limits::max()) + result = result % (range + 1); + + return static_cast(min + result); +} + +// Returns a floating point value in the range [Type's lowest, Type's max] by +// consuming bytes from the input data. If there's no input data left, always +// returns approximately 0. +template T FuzzedDataProvider::ConsumeFloatingPoint() { + return ConsumeFloatingPointInRange(std::numeric_limits::lowest(), + std::numeric_limits::max()); +} + +// Returns a floating point value in the given range by consuming bytes from +// the input data. If there's no input data left, returns |min|. Note that +// |min| must be less than or equal to |max|. +template +T FuzzedDataProvider::ConsumeFloatingPointInRange(T min, T max) { + if (min > max) + abort(); + + T range = .0; + T result = min; + constexpr T zero(.0); + if (max > zero && min < zero && max > min + std::numeric_limits::max()) { + // The diff |max - min| would overflow the given floating point type. Use + // the half of the diff as the range and consume a bool to decide whether + // the result is in the first of the second part of the diff. + range = (max / 2.0) - (min / 2.0); + if (ConsumeBool()) { + result += range; + } + } else { + range = max - min; } - void Advance(size_t num_bytes) { - if (num_bytes > remaining_bytes_) + return result + range * ConsumeProbability(); +} + +// Returns a floating point number in the range [0.0, 1.0]. If there's no +// input data left, always returns 0. +template T FuzzedDataProvider::ConsumeProbability() { + static_assert(std::is_floating_point::value, + "A floating point type is required."); + + // Use different integral types for different floating point types in order + // to provide better density of the resulting values. + using IntegralType = + typename std::conditional<(sizeof(T) <= sizeof(uint32_t)), uint32_t, + uint64_t>::type; + + T result = static_cast(ConsumeIntegral()); + result /= static_cast(std::numeric_limits::max()); + return result; +} + +// Reads one byte and returns a bool, or false when no data remains. +bool FuzzedDataProvider::ConsumeBool() { + return 1 & ConsumeIntegral(); +} + +// Returns an enum value. The enum must start at 0 and be contiguous. It must +// also contain |kMaxValue| aliased to its largest (inclusive) value. Such as: +// enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue }; +template T FuzzedDataProvider::ConsumeEnum() { + static_assert(std::is_enum::value, "|T| must be an enum type."); + return static_cast( + ConsumeIntegralInRange(0, static_cast(T::kMaxValue))); +} + +// Returns a copy of the value selected from the given fixed-size |array|. +template +T FuzzedDataProvider::PickValueInArray(const T (&array)[size]) { + static_assert(size > 0, "The array must be non empty."); + return array[ConsumeIntegralInRange(0, size - 1)]; +} + +template +T FuzzedDataProvider::PickValueInArray(std::initializer_list list) { + // TODO(Dor1s): switch to static_assert once C++14 is allowed. + if (!list.size()) + abort(); + + return *(list.begin() + ConsumeIntegralInRange(0, list.size() - 1)); +} + +// Writes |num_bytes| of input data to the given destination pointer. If there +// is not enough data left, writes all remaining bytes. Return value is the +// number of bytes written. +// In general, it's better to avoid using this function, but it may be useful +// in cases when it's necessary to fill a certain buffer or object with +// fuzzing data. +size_t FuzzedDataProvider::ConsumeData(void *destination, size_t num_bytes) { + num_bytes = std::min(num_bytes, remaining_bytes_); + CopyAndAdvance(destination, num_bytes); + return num_bytes; +} + +// Private methods. +void FuzzedDataProvider::CopyAndAdvance(void *destination, size_t num_bytes) { + std::memcpy(destination, data_ptr_, num_bytes); + Advance(num_bytes); +} + +void FuzzedDataProvider::Advance(size_t num_bytes) { + if (num_bytes > remaining_bytes_) + abort(); + + data_ptr_ += num_bytes; + remaining_bytes_ -= num_bytes; +} + +template +std::vector FuzzedDataProvider::ConsumeBytes(size_t size, size_t num_bytes) { + static_assert(sizeof(T) == sizeof(uint8_t), "Incompatible data type."); + + // The point of using the size-based constructor below is to increase the + // odds of having a vector object with capacity being equal to the length. + // That part is always implementation specific, but at least both libc++ and + // libstdc++ allocate the requested number of bytes in that constructor, + // which seems to be a natural choice for other implementations as well. + // To increase the odds even more, we also call |shrink_to_fit| below. + std::vector result(size); + if (size == 0) { + if (num_bytes != 0) abort(); - - data_ptr_ += num_bytes; - remaining_bytes_ -= num_bytes; - } - - template - std::vector ConsumeBytes(size_t size, size_t num_bytes) { - static_assert(sizeof(T) == sizeof(uint8_t), "Incompatible data type."); - - // The point of using the size-based constructor below is to increase the - // odds of having a vector object with capacity being equal to the length. - // That part is always implementation specific, but at least both libc++ and - // libstdc++ allocate the requested number of bytes in that constructor, - // which seems to be a natural choice for other implementations as well. - // To increase the odds even more, we also call |shrink_to_fit| below. - std::vector result(size); - if (size == 0) { - if (num_bytes != 0) - abort(); - return result; - } - - CopyAndAdvance(result.data(), num_bytes); - - // Even though |shrink_to_fit| is also implementation specific, we expect it - // to provide an additional assurance in case vector's constructor allocated - // a buffer which is larger than the actual amount of data we put inside it. - result.shrink_to_fit(); return result; } - template TS ConvertUnsignedToSigned(TU value) { - static_assert(sizeof(TS) == sizeof(TU), "Incompatible data types."); - static_assert(!std::numeric_limits::is_signed, - "Source type must be unsigned."); - - // TODO(Dor1s): change to `if constexpr` once C++17 becomes mainstream. - if (std::numeric_limits::is_modulo) - return static_cast(value); - - // Avoid using implementation-defined unsigned to signed conversions. - // To learn more, see https://stackoverflow.com/questions/13150449. - if (value <= std::numeric_limits::max()) { - return static_cast(value); - } else { - constexpr auto TS_min = std::numeric_limits::min(); - return TS_min + static_cast(value - TS_min); - } + CopyAndAdvance(result.data(), num_bytes); + + // Even though |shrink_to_fit| is also implementation specific, we expect it + // to provide an additional assurance in case vector's constructor allocated + // a buffer which is larger than the actual amount of data we put inside it. + result.shrink_to_fit(); + return result; +} + +template +TS FuzzedDataProvider::ConvertUnsignedToSigned(TU value) { + static_assert(sizeof(TS) == sizeof(TU), "Incompatible data types."); + static_assert(!std::numeric_limits::is_signed, + "Source type must be unsigned."); + + // TODO(Dor1s): change to `if constexpr` once C++17 becomes mainstream. + if (std::numeric_limits::is_modulo) + return static_cast(value); + + // Avoid using implementation-defined unsigned to signed conversions. + // To learn more, see https://stackoverflow.com/questions/13150449. + if (value <= std::numeric_limits::max()) { + return static_cast(value); + } else { + constexpr auto TS_min = std::numeric_limits::min(); + return TS_min + static_cast(value - TS_min); } - - const uint8_t *data_ptr_; - size_t remaining_bytes_; -}; +} #endif // LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ From 453eac3f779b562b68b5171d0b30481111c2b51e Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Wed, 25 Mar 2020 16:34:34 +0900 Subject: [PATCH 29/33] Minor fixes to a comment in CodeGenPrepare --- llvm/lib/CodeGen/CodeGenPrepare.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 1cccd6fade4efb..1fd5154cbf5be5 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -7200,7 +7200,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) { } if (FreezeInst *FI = dyn_cast(I)) { - // br(freeze(icmp a, const)) -> br(icmp (freeze a), const) + // freeze(icmp a, const)) -> icmp (freeze a), const // This helps generate efficient conditional jumps. Instruction *CmpI = nullptr; if (ICmpInst *II = dyn_cast(FI->getOperand(0))) From 1e65209e045598c3ca5833331215606a02267229 Mon Sep 17 00:00:00 2001 From: Max Moroz Date: Wed, 25 Mar 2020 00:39:17 -0700 Subject: [PATCH 30/33] [compiler-rt] FuzzedDataProvider: make linter happy. --- compiler-rt/include/fuzzer/FuzzedDataProvider.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/include/fuzzer/FuzzedDataProvider.h b/compiler-rt/include/fuzzer/FuzzedDataProvider.h index 4945acf4547337..21d5fac457d62b 100644 --- a/compiler-rt/include/fuzzer/FuzzedDataProvider.h +++ b/compiler-rt/include/fuzzer/FuzzedDataProvider.h @@ -27,7 +27,7 @@ // In addition to the comments below, the API is also briefly documented at // https://github.com/google/fuzzing/blob/master/docs/split-inputs.md#fuzzed-data-provider class FuzzedDataProvider { -public: + public: // |data| is an array of length |size| that the FuzzedDataProvider wraps to // provide more granular access. |data| must outlive the FuzzedDataProvider. FuzzedDataProvider(const uint8_t *data, size_t size) @@ -79,7 +79,7 @@ class FuzzedDataProvider { // Reports the remaining bytes available for fuzzed input. size_t remaining_bytes() { return remaining_bytes_; } -private: + private: FuzzedDataProvider(const FuzzedDataProvider &) = delete; FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete; From 0788acbccbec094903a3425ffe5a98f8d55cbd64 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Tue, 24 Mar 2020 13:43:25 +0100 Subject: [PATCH 31/33] [AST] Build recovery expressions by default for C++. Update the existing tests. Reviewers: sammccall Subscribers: cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D76696 --- clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp | 4 +++- clang/include/clang/Basic/LangOptions.def | 2 +- clang/lib/Frontend/CompilerInvocation.cpp | 2 +- clang/lib/Sema/SemaExpr.cpp | 4 ++-- clang/test/OpenMP/target_update_from_messages.cpp | 6 +++--- clang/test/OpenMP/target_update_to_messages.cpp | 6 +++--- clang/test/Parser/objcxx0x-lambda-expressions.mm | 6 ++++-- clang/test/Parser/objcxx11-invalid-lambda.cpp | 9 +++++---- clang/test/SemaCXX/builtins.cpp | 4 ++-- clang/test/SemaCXX/cast-conversion.cpp | 2 +- clang/test/SemaCXX/cxx1z-copy-omission.cpp | 8 +++++--- clang/test/SemaCXX/decltype-crash.cpp | 5 ++++- clang/test/SemaCXX/varargs.cpp | 3 ++- clang/test/SemaOpenCLCXX/address-space-references.cl | 2 +- clang/test/SemaTemplate/instantiate-init.cpp | 2 +- clang/unittests/Sema/CodeCompleteTest.cpp | 3 ++- 16 files changed, 40 insertions(+), 28 deletions(-) diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp index 0d6e37f0011840..f5c90a4677cb84 100644 --- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp +++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp @@ -1195,7 +1195,9 @@ TEST(SignatureHelpTest, OpeningParen) { int foo(int a, int b, int c); int main() { #define ID(X) X - ID(foo $p^( foo(10), ^ )) + // FIXME: figure out why ID(foo (foo(10), )) doesn't work when preserving + // the recovery expression. + ID(foo $p^( 10, ^ )) })cpp"}; for (auto Test : Tests) { diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 6152e227d59981..22c4a87918e956 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -148,7 +148,7 @@ LANGOPT(RelaxedTemplateTemplateArgs, 1, 0, "C++17 relaxed matching of template t LANGOPT(DoubleSquareBracketAttributes, 1, 0, "'[[]]' attributes extension for all language standard modes") -COMPATIBLE_LANGOPT(RecoveryAST, 1, 0, "Preserve expressions in AST when encountering errors") +COMPATIBLE_LANGOPT(RecoveryAST, 1, CPlusPlus, "Preserve expressions in AST when encountering errors") BENIGN_LANGOPT(ThreadsafeStatics , 1, 1, "thread-safe static initializers") LANGOPT(POSIXThreads , 1, 0, "POSIX thread support") diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index dc5e932c9460a7..08e0700671a8d2 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -2909,7 +2909,7 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK, if (Args.hasArg(OPT_fconcepts_ts)) Diags.Report(diag::warn_fe_concepts_ts_flag); Opts.RecoveryAST = - Args.hasFlag(OPT_frecovery_ast, OPT_fno_recovery_ast, false); + Args.hasFlag(OPT_frecovery_ast, OPT_fno_recovery_ast, Opts.CPlusPlus); Opts.HeinousExtensions = Args.hasArg(OPT_fheinous_gnu_extensions); Opts.AccessControl = !Args.hasArg(OPT_fno_access_control); Opts.ElideConstructors = !Args.hasArg(OPT_fno_elide_constructors); diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 2e1231c57c287c..152d177f147b5a 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -18437,8 +18437,8 @@ bool Sema::IsDependentFunctionNameExpr(Expr *E) { ExprResult Sema::CreateRecoveryExpr(SourceLocation Begin, SourceLocation End, ArrayRef SubExprs) { - // FIXME: enable it for C++, RecoveryExpr is type-dependent to suppress - // bogus diagnostics and this trick does not work in C. + // RecoveryExpr is type-dependent to suppress bogus diagnostics and this trick + // does not work in C. // FIXME: use containsErrors() to suppress unwanted diags in C. if (!Context.getLangOpts().RecoveryAST) return ExprError(); diff --git a/clang/test/OpenMP/target_update_from_messages.cpp b/clang/test/OpenMP/target_update_from_messages.cpp index eaf862a1058278..141cfc38ffeb12 100644 --- a/clang/test/OpenMP/target_update_from_messages.cpp +++ b/clang/test/OpenMP/target_update_from_messages.cpp @@ -74,7 +74,7 @@ struct S8 { #pragma omp target update from(*(this->S->i+this->S->s6[0].pp)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} #pragma omp target update from(*(a+this->ptr)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} #pragma omp target update from(*(*(this->ptr)+a+this->ptr)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} -#pragma omp target update from(*(this+this)) // expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} expected-error {{invalid operands to binary expression ('S8 *' and 'S8 *')}} +#pragma omp target update from(*(this+this)) // expected-error {{invalid operands to binary expression ('S8 *' and 'S8 *')}} } }; @@ -198,8 +198,8 @@ int main(int argc, char **argv) { #pragma omp target update from(**(-(*offset)+BB+*m)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} #pragma omp target update from(**(*(*(&offset))+BB-*m)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} #pragma omp target update from(*(x+*(y+*(**BB+BBB)+s7.i))) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} -#pragma omp target update from(*(m+(m))) // expected-error {{invalid operands to binary expression ('int *' and 'int *')}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} -#pragma omp target update from(*(1+y+y)) // expected-error {{indirection requires pointer operand ('int' invalid)}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} +#pragma omp target update from(*(m+(m))) // expected-error {{invalid operands to binary expression ('int *' and 'int *')}} +#pragma omp target update from(*(1+y+y)) // expected-error {{indirection requires pointer operand ('int' invalid)}} #pragma omp target data map(to: s7.i) { #pragma omp target update from(s7.x) diff --git a/clang/test/OpenMP/target_update_to_messages.cpp b/clang/test/OpenMP/target_update_to_messages.cpp index 66ba9d3bb23f32..832adc0dd4d094 100644 --- a/clang/test/OpenMP/target_update_to_messages.cpp +++ b/clang/test/OpenMP/target_update_to_messages.cpp @@ -77,7 +77,7 @@ struct S8 { #pragma omp target update to(*(this->S->i+this->S->s6[0].pp)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} #pragma omp target update to(*(a+this->ptr)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} #pragma omp target update to(*(*(this->ptr)+a+this->ptr)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} -#pragma omp target update to(*(this+this)) // expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} expected-error {{invalid operands to binary expression ('S8 *' and 'S8 *')}} +#pragma omp target update to(*(this+this)) // expected-error {{invalid operands to binary expression ('S8 *' and 'S8 *')}} {} } }; @@ -205,8 +205,8 @@ int main(int argc, char **argv) { #pragma omp target update to(**(*offset+BB+*m)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} #pragma omp target update to(**(*(*(&offset))+BB+*m)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} #pragma omp target update to(*(x+*(y+*(**BB+BBB)+s7.i))) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} -#pragma omp target update to(*(m+(m))) // expected-error {{invalid operands to binary expression ('int *' and 'int *')}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} -#pragma omp target update to(*(1+y+y)) // expected-error {{indirection requires pointer operand ('int' invalid)}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} +#pragma omp target update to(*(m+(m))) // expected-error {{invalid operands to binary expression ('int *' and 'int *')}} +#pragma omp target update to(*(1+y+y)) // expected-error {{indirection requires pointer operand ('int' invalid)}} {} return tmain(argc)+tmain(argc); // expected-note {{in instantiation of function template specialization 'tmain' requested here}} expected-note {{in instantiation of function template specialization 'tmain' requested here}} } diff --git a/clang/test/Parser/objcxx0x-lambda-expressions.mm b/clang/test/Parser/objcxx0x-lambda-expressions.mm index 0f3e9481a72207..396313816fe5dd 100644 --- a/clang/test/Parser/objcxx0x-lambda-expressions.mm +++ b/clang/test/Parser/objcxx0x-lambda-expressions.mm @@ -11,7 +11,8 @@ void f() { []; // expected-error {{expected body of lambda expression}} [=,foo+] {}; // expected-error {{expected ',' or ']' in lambda capture list}} - [&this] {}; // expected-error {{cannot take the address of an rvalue of type 'C *'}} + [&this] {}; // expected-error {{cannot take the address of an rvalue of type 'C *'}} \ + // expected-error {{expected identifier}} [] {}; [=] (int i) {}; [&] (int) mutable -> void {}; @@ -24,7 +25,8 @@ void f() { [foo{bar}] () {}; [foo = {bar}] () {}; // expected-error {{}} - [foo(bar) baz] () {}; // expected-error {{called object type 'int' is not a function}} + [foo(bar) baz] () {}; // expected-error {{called object type 'int' is not a function}} \ + // expected-error {{expected ';'}} [foo(bar), baz] () {}; // ok [foo = bar baz]; // expected-warning {{receiver type 'int'}} expected-warning {{instance method '-baz'}} diff --git a/clang/test/Parser/objcxx11-invalid-lambda.cpp b/clang/test/Parser/objcxx11-invalid-lambda.cpp index bdb4e880fd0b76..221cae9b907093 100644 --- a/clang/test/Parser/objcxx11-invalid-lambda.cpp +++ b/clang/test/Parser/objcxx11-invalid-lambda.cpp @@ -1,10 +1,11 @@ // RUN: %clang_cc1 -fsyntax-only -verify -x objective-c++ -std=c++11 %s -void foo() { // expected-note {{to match this '{'}} +void foo() { int bar; auto baz = [ - bar( // expected-note {{to match this '('}} expected-note {{to match this '('}} + bar( // expected-note 2{{to match this '('}}\ + // expected-warning {{captures are a C++14 extension}} foo_undeclared() // expected-error{{use of undeclared identifier 'foo_undeclared'}} /* ) */ - ] () { }; // expected-error{{expected ')'}} -} // expected-error{{expected ')'}} expected-error {{expected ',' or ']'}} expected-error{{expected ';' at end of declaration}} expected-error{{expected '}'}} + ] () { }; // expected-error 2{{expected ')'}} +} \ No newline at end of file diff --git a/clang/test/SemaCXX/builtins.cpp b/clang/test/SemaCXX/builtins.cpp index fbe2c457dad979..5306f4fad83b2d 100644 --- a/clang/test/SemaCXX/builtins.cpp +++ b/clang/test/SemaCXX/builtins.cpp @@ -14,8 +14,8 @@ template int equal(const char *s1, const char *s2) { return Compare(s1, s2) == 0; } -// FIXME: Our error recovery here sucks -template int equal<&__builtin_strcmp>(const char*, const char*); // expected-error {{builtin functions must be directly called}} expected-error {{expected unqualified-id}} expected-error {{expected ')'}} expected-note {{to match this '('}} + +template int equal<&__builtin_strcmp>(const char*, const char*); // expected-error {{builtin functions must be directly called}} // PR13195 void f2() { diff --git a/clang/test/SemaCXX/cast-conversion.cpp b/clang/test/SemaCXX/cast-conversion.cpp index 4d5abfdcfb1e3f..e48d918f889bcb 100644 --- a/clang/test/SemaCXX/cast-conversion.cpp +++ b/clang/test/SemaCXX/cast-conversion.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -triple x86_64-unknown-unknown -verify %s -std=c++11 +// RUN: %clang_cc1 -fsyntax-only -triple x86_64-unknown-unknown -verify %s -std=c++11 -Wno-unused struct R { R(int); diff --git a/clang/test/SemaCXX/cxx1z-copy-omission.cpp b/clang/test/SemaCXX/cxx1z-copy-omission.cpp index a7133d79b463f0..eceac810e72a50 100644 --- a/clang/test/SemaCXX/cxx1z-copy-omission.cpp +++ b/clang/test/SemaCXX/cxx1z-copy-omission.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -std=c++1z -verify %s +// RUN: %clang_cc1 -std=c++1z -verify -Wno-unused %s struct Noncopyable { Noncopyable(); @@ -107,8 +107,10 @@ void test_expressions(bool b) { sizeof(make_indestructible()); // expected-error {{deleted}} sizeof(make_incomplete()); // expected-error {{incomplete}} typeid(Indestructible{}); // expected-error {{deleted}} - typeid(make_indestructible()); // expected-error {{deleted}} - typeid(make_incomplete()); // expected-error {{incomplete}} + typeid(make_indestructible()); // expected-error {{deleted}} \ + // expected-error {{need to include }} + typeid(make_incomplete()); // expected-error {{incomplete}} \ + // expected-error {{need to include }} // FIXME: The first two cases here are now also valid in C++17 onwards. using I = decltype(Indestructible()); // expected-error {{deleted}} diff --git a/clang/test/SemaCXX/decltype-crash.cpp b/clang/test/SemaCXX/decltype-crash.cpp index 1cebfcd72c8619..1ffc525a270378 100644 --- a/clang/test/SemaCXX/decltype-crash.cpp +++ b/clang/test/SemaCXX/decltype-crash.cpp @@ -3,5 +3,8 @@ int& a(); void f() { - decltype(a()) c; // expected-warning {{'decltype' is a keyword in C++11}} expected-error {{use of undeclared identifier 'decltype'}} + decltype(a()) c; // expected-warning {{'decltype' is a keyword in C++11}} \ + // expected-error {{use of undeclared identifier 'decltype'}} \ + // expected-error {{expected ';' after expression}} \ + // expected-error {{use of undeclared identifier 'c'}} } diff --git a/clang/test/SemaCXX/varargs.cpp b/clang/test/SemaCXX/varargs.cpp index f2f53dc2001fd0..625f1dce18f3df 100644 --- a/clang/test/SemaCXX/varargs.cpp +++ b/clang/test/SemaCXX/varargs.cpp @@ -22,7 +22,8 @@ void no_params(...) { // default ctor. void record_context(int a, ...) { struct Foo { - // expected-error@+1 {{'va_start' cannot be used outside a function}} + // expected-error@+2 {{'va_start' cannot be used outside a function}} + // expected-error@+1 {{default argument references parameter 'a'}} void meth(int a, int b = (__builtin_va_start(ap, a), 0)) {} }; } diff --git a/clang/test/SemaOpenCLCXX/address-space-references.cl b/clang/test/SemaOpenCLCXX/address-space-references.cl index 068318dfa1415f..66cd1c02e32f9b 100644 --- a/clang/test/SemaOpenCLCXX/address-space-references.cl +++ b/clang/test/SemaOpenCLCXX/address-space-references.cl @@ -11,7 +11,7 @@ int bar(const __global unsigned int &i); // expected-note{{passing argument to p int bar(const unsigned int &i); void foo() { - bar(1) // expected-error{{binding reference of type 'const __global unsigned int' to value of type 'int' changes address space}} + bar(1); // expected-error{{binding reference of type 'const __global unsigned int' to value of type 'int' changes address space}} } // Test addr space conversion with nested pointers diff --git a/clang/test/SemaTemplate/instantiate-init.cpp b/clang/test/SemaTemplate/instantiate-init.cpp index 99b29c77d55af9..69f4d5fc4cc463 100644 --- a/clang/test/SemaTemplate/instantiate-init.cpp +++ b/clang/test/SemaTemplate/instantiate-init.cpp @@ -100,7 +100,7 @@ namespace PR7985 { integral_c<1> ic1 = array_lengthof(Description::data); (void)sizeof(array_lengthof(Description::data)); - sizeof(array_lengthof( // expected-error{{no matching function for call to 'array_lengthof'}} + (void)sizeof(array_lengthof( // expected-error{{no matching function for call to 'array_lengthof'}} Description::data // expected-note{{in instantiation of static data member 'PR7985::Description::data' requested here}} )); diff --git a/clang/unittests/Sema/CodeCompleteTest.cpp b/clang/unittests/Sema/CodeCompleteTest.cpp index a9441a679cac71..5c8940cf6ebaeb 100644 --- a/clang/unittests/Sema/CodeCompleteTest.cpp +++ b/clang/unittests/Sema/CodeCompleteTest.cpp @@ -487,6 +487,7 @@ TEST(PreferredTypeTest, NoCrashOnInvalidTypes) { auto x = decltype(&1)(^); auto y = new decltype(&1)(^); )cpp"; - EXPECT_THAT(collectPreferredTypes(Code), Each("NULL TYPE")); + EXPECT_THAT(collectPreferredTypes(Code), + Each("decltype((1))")); } } // namespace From 28a42dd1b9e964dba39d92b1a7af2317e2d16e95 Mon Sep 17 00:00:00 2001 From: "Kazushi (Jam) Marukawa" Date: Wed, 25 Mar 2020 09:19:52 +0100 Subject: [PATCH 32/33] [VE] Change name of enum to CondCode Summary: Change enum name for condition codes from CondCodes to CondCode. Reviewers: arsenm, simoll, k-ishizaka Reviewed By: arsenm Subscribers: wdng, hiraditya, llvm-commits Tags: #llvm, #ve Differential Revision: https://reviews.llvm.org/D76747 --- llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp | 2 +- llvm/lib/Target/VE/VE.h | 4 ++-- llvm/lib/Target/VE/VEInstrInfo.cpp | 4 ++-- llvm/lib/Target/VE/VEInstrInfo.td | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp index 0a9e27462c0b4a..1229a8c6dc7113 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp @@ -113,5 +113,5 @@ void VEInstPrinter::printMemASOperand(const MCInst *MI, int opNum, void VEInstPrinter::printCCOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, raw_ostream &O) { int CC = (int)MI->getOperand(opNum).getImm(); - O << VECondCodeToString((VECC::CondCodes)CC); + O << VECondCodeToString((VECC::CondCode)CC); } diff --git a/llvm/lib/Target/VE/VE.h b/llvm/lib/Target/VE/VE.h index 03172782796661..e5a5b87adf7275 100644 --- a/llvm/lib/Target/VE/VE.h +++ b/llvm/lib/Target/VE/VE.h @@ -37,7 +37,7 @@ namespace llvm { // Enums corresponding to VE condition codes, both icc's and fcc's. These // values must be kept in sync with the ones in the .td file. namespace VECC { -enum CondCodes { +enum CondCode { // Integer comparison CC_IG = 0, // Greater CC_IL = 1, // Less @@ -66,7 +66,7 @@ enum CondCodes { }; } -inline static const char *VECondCodeToString(VECC::CondCodes CC) { +inline static const char *VECondCodeToString(VECC::CondCode CC) { switch (CC) { case VECC::CC_IG: return "gt"; case VECC::CC_IL: return "lt"; diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp index 89c618544e3894..cc24f2e29c5973 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.cpp +++ b/llvm/lib/Target/VE/VEInstrInfo.cpp @@ -40,7 +40,7 @@ VEInstrInfo::VEInstrInfo(VESubtarget &ST) static bool IsIntegerCC(unsigned CC) { return (CC < VECC::CC_AF); } -static VECC::CondCodes GetOppositeBranchCondition(VECC::CondCodes CC) { +static VECC::CondCode GetOppositeBranchCondition(VECC::CondCode CC) { switch(CC) { case VECC::CC_IG: return VECC::CC_ILE; case VECC::CC_IL: return VECC::CC_IGE; @@ -268,7 +268,7 @@ unsigned VEInstrInfo::removeBranch(MachineBasicBlock &MBB, bool VEInstrInfo::reverseBranchCondition( SmallVectorImpl &Cond) const { - VECC::CondCodes CC = static_cast(Cond[0].getImm()); + VECC::CondCode CC = static_cast(Cond[0].getImm()); Cond[0].setImm(GetOppositeBranchCondition(CC)); return false; } diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td index 58244540ccd27b..38dfb28cae1b54 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.td +++ b/llvm/lib/Target/VE/VEInstrInfo.td @@ -83,7 +83,7 @@ def HI32 : SDNodeXForm; def icond2cc : SDNodeXFormget()) { default: llvm_unreachable("Unknown integer condition code!"); case ISD::SETEQ: cc = VECC::CC_IEQ; break; @@ -101,7 +101,7 @@ def icond2cc : SDNodeXForm; def fcond2cc : SDNodeXFormget()) { default: llvm_unreachable("Unknown float condition code!"); case ISD::SETFALSE: cc = VECC::CC_AF; break; From e951a4899626bc32612c44f6b13a18e43b0a8260 Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Wed, 25 Mar 2020 17:18:19 +0900 Subject: [PATCH 33/33] Add freeze(and x, const) case to codegenprepare's freeze-cmp.ll --- .../CodeGenPrepare/X86/freeze-cmp.ll | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/freeze-cmp.ll b/llvm/test/Transforms/CodeGenPrepare/X86/freeze-cmp.ll index d0d40bc91a9336..cb6cc8d9108b8d 100644 --- a/llvm/test/Transforms/CodeGenPrepare/X86/freeze-cmp.ll +++ b/llvm/test/Transforms/CodeGenPrepare/X86/freeze-cmp.ll @@ -105,5 +105,31 @@ define i1 @fcmp_nan(float %a) { ret i1 %fr } +define void @and(i32 %flag) { +; CHECK-LABEL: @and( +; CHECK-NEXT: [[V:%.*]] = and i32 [[FLAG:%.*]], 1 +; CHECK-NEXT: [[FR:%.*]] = freeze i32 [[V]] +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[FR]], 0 +; CHECK-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: call void @g1() +; CHECK-NEXT: ret void +; CHECK: B: +; CHECK-NEXT: call void @g2() +; CHECK-NEXT: ret void +; + %v = and i32 %flag, 1 + %c = icmp eq i32 %v, 0 + %fr = freeze i1 %c + br i1 %fr, label %A, label %B +A: + call void @g1() + ret void +B: + call void @g2() + ret void +} + + declare void @g1() declare void @g2()