From 413577a8790407d75ba834fa5668c2632fe1851e Mon Sep 17 00:00:00 2001 From: Xiang1 Zhang Date: Wed, 30 Sep 2020 18:01:15 +0800 Subject: [PATCH 1/8] [X86] Support Intel Key Locker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Key Locker provides a mechanism to encrypt and decrypt data with an AES key without having access to the raw key value by converting AES keys into “handles”. These handles can be used to perform the same encryption and decryption operations as the original AES keys, but they only work on the current system and only until they are revoked. If software revokes Key Locker handles (e.g., on a reboot), then any previous handles can no longer be used. Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D88398 --- clang/include/clang/Basic/BuiltinsX86.def | 19 + clang/include/clang/Driver/Options.td | 4 + clang/lib/Basic/Targets/X86.cpp | 12 + clang/lib/Basic/Targets/X86.h | 2 + clang/lib/CodeGen/CGBuiltin.cpp | 87 +++++ clang/lib/Headers/CMakeLists.txt | 2 + clang/lib/Headers/immintrin.h | 10 + clang/lib/Headers/keylocker_wide_intrin.h | 259 +++++++++++++ clang/lib/Headers/keylockerintrin.h | 343 ++++++++++++++++++ clang/test/CodeGen/X86/keylocker.c | 72 ++++ clang/test/CodeGen/attr-target-x86.c | 2 +- clang/test/Driver/x86-target-features.c | 10 + clang/test/Preprocessor/x86_target_features.c | 19 + llvm/include/llvm/IR/IntrinsicsX86.td | 53 +++ llvm/include/llvm/Support/X86TargetParser.def | 2 + llvm/lib/IR/Function.cpp | 4 +- llvm/lib/Support/Host.cpp | 5 + llvm/lib/Support/X86TargetParser.cpp | 6 +- llvm/lib/Target/X86/X86.td | 6 + llvm/lib/Target/X86/X86ISelLowering.cpp | 179 +++++++++ llvm/lib/Target/X86/X86InstrInfo.td | 5 + llvm/lib/Target/X86/X86InstrInfo.td.rej | 11 + llvm/lib/Target/X86/X86InstrKL.td | 66 ++++ llvm/lib/Target/X86/X86Subtarget.h | 8 + llvm/test/CodeGen/X86/keylocker-intrinsics.ll | 312 ++++++++++++++++ .../X86/KEYLOCKER/Keylocker-x86-32-att.txt | 276 
++++++++++++++ .../X86/KEYLOCKER/Keylocker-x86-32-intel.txt | 223 ++++++++++++ .../X86/KEYLOCKER/Keylocker-x86-64-att.txt | 277 ++++++++++++++ .../X86/KEYLOCKER/Keylocker-x86-64-intel.txt | 223 ++++++++++++ llvm/test/MC/X86/KEYLOCKER/keylocker-att.s | 205 +++++++++++ llvm/test/MC/X86/KEYLOCKER/keylocker-intel.s | 205 +++++++++++ .../MC/X86/KEYLOCKER/x86-64-keylocker-att.s | 205 +++++++++++ .../MC/X86/KEYLOCKER/x86-64-keylocker-intel.s | 205 +++++++++++ llvm/utils/TableGen/IntrinsicEmitter.cpp | 4 +- 34 files changed, 3317 insertions(+), 4 deletions(-) create mode 100644 clang/lib/Headers/keylocker_wide_intrin.h create mode 100644 clang/lib/Headers/keylockerintrin.h create mode 100644 clang/test/CodeGen/X86/keylocker.c create mode 100644 llvm/lib/Target/X86/X86InstrInfo.td.rej create mode 100644 llvm/lib/Target/X86/X86InstrKL.td create mode 100644 llvm/test/CodeGen/X86/keylocker-intrinsics.ll create mode 100644 llvm/test/MC/Disassembler/X86/KEYLOCKER/Keylocker-x86-32-att.txt create mode 100644 llvm/test/MC/Disassembler/X86/KEYLOCKER/Keylocker-x86-32-intel.txt create mode 100644 llvm/test/MC/Disassembler/X86/KEYLOCKER/Keylocker-x86-64-att.txt create mode 100644 llvm/test/MC/Disassembler/X86/KEYLOCKER/Keylocker-x86-64-intel.txt create mode 100644 llvm/test/MC/X86/KEYLOCKER/keylocker-att.s create mode 100644 llvm/test/MC/X86/KEYLOCKER/keylocker-intel.s create mode 100644 llvm/test/MC/X86/KEYLOCKER/x86-64-keylocker-att.s create mode 100644 llvm/test/MC/X86/KEYLOCKER/x86-64-keylocker-intel.s diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def index 35fb98352ec2be..e212d0a2a0cca5 100644 --- a/clang/include/clang/Basic/BuiltinsX86.def +++ b/clang/include/clang/Basic/BuiltinsX86.def @@ -1900,6 +1900,25 @@ TARGET_BUILTIN(__builtin_ia32_invpcid, "vUiv*", "nc", "invpcid") TARGET_BUILTIN(__builtin_ia32_enqcmd, "Ucv*vC*", "n", "enqcmd") TARGET_BUILTIN(__builtin_ia32_enqcmds, "Ucv*vC*", "n", "enqcmd") +// KEY LOCKER 
+TARGET_BUILTIN(__builtin_ia32_loadiwkey, "vUiV2OiV2OiV2Oi", "nV:128:", "kl") +TARGET_BUILTIN(__builtin_ia32_encodekey128, + "UiUiV2OiV2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*", "nV:128:", "kl") +TARGET_BUILTIN(__builtin_ia32_encodekey256, + "UiUiV2OiV2OiV2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*", "nV:128:", "kl") +TARGET_BUILTIN(__builtin_ia32_aesenc128kl, "UcV2Oi*V2OivC*", "nV:128:", "kl") +TARGET_BUILTIN(__builtin_ia32_aesenc256kl, "UcV2Oi*V2OivC*", "nV:128:", "kl") +TARGET_BUILTIN(__builtin_ia32_aesdec128kl, "UcV2Oi*V2OivC*", "nV:128:", "kl") +TARGET_BUILTIN(__builtin_ia32_aesdec256kl, "UcV2Oi*V2OivC*", "nV:128:", "kl") +TARGET_BUILTIN(__builtin_ia32_aesencwide128kl, + "UcvC*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2OiV2OiV2OiV2OiV2OiV2OiV2OiV2Oi", "nV:128:", "kl,widekl") +TARGET_BUILTIN(__builtin_ia32_aesencwide256kl, + "UcvC*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2OiV2OiV2OiV2OiV2OiV2OiV2OiV2Oi", "nV:128:", "kl,widekl") +TARGET_BUILTIN(__builtin_ia32_aesdecwide128kl, + "UcvC*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2OiV2OiV2OiV2OiV2OiV2OiV2OiV2Oi", "nV:128:", "kl,widekl") +TARGET_BUILTIN(__builtin_ia32_aesdecwide256kl, + "UcvC*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2Oi*V2OiV2OiV2OiV2OiV2OiV2OiV2OiV2Oi", "nV:128:", "kl,widekl") + // SERIALIZE TARGET_BUILTIN(__builtin_ia32_serialize, "v", "n", "serialize") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 09fdf50b1cb80c..672a833c9d4da1 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3253,6 +3253,10 @@ def minvpcid : Flag<["-"], "minvpcid">, Group; def mno_invpcid : Flag<["-"], "mno-invpcid">, Group; def mgfni : Flag<["-"], "mgfni">, Group; def mno_gfni : Flag<["-"], "mno-gfni">, Group; +def mkl : Flag<["-"], "mkl">, Group; +def mno_kl : Flag<["-"], "mno-kl">, Group; +def mwidekl : Flag<["-"], "mwidekl">, Group; +def mno_widekl : Flag<["-"], "mno-widekl">, Group; def mlwp : Flag<["-"], "mlwp">, Group; def mno_lwp : Flag<["-"], 
"mno-lwp">, Group; def mlzcnt : Flag<["-"], "mlzcnt">, Group; diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index b829dfac74fbf1..5d89894c762836 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -276,6 +276,10 @@ bool X86TargetInfo::handleTargetFeatures(std::vector &Features, HasCLDEMOTE = true; } else if (Feature == "+rdpid") { HasRDPID = true; + } else if (Feature == "+kl") { + HasKL = true; + } else if (Feature == "+widekl") { + HasWIDEKL = true; } else if (Feature == "+retpoline-external-thunk") { HasRetpolineExternalThunk = true; } else if (Feature == "+sahf") { @@ -678,6 +682,10 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__PREFETCHWT1__"); if (HasCLZERO) Builder.defineMacro("__CLZERO__"); + if (HasKL) + Builder.defineMacro("__KL__"); + if (HasWIDEKL) + Builder.defineMacro("__WIDEKL__"); if (HasRDPID) Builder.defineMacro("__RDPID__"); if (HasCLDEMOTE) @@ -833,6 +841,8 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("fxsr", true) .Case("gfni", true) .Case("invpcid", true) + .Case("kl", true) + .Case("widekl", true) .Case("lwp", true) .Case("lzcnt", true) .Case("mmx", true) @@ -919,6 +929,8 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("fxsr", HasFXSR) .Case("gfni", HasGFNI) .Case("invpcid", HasINVPCID) + .Case("kl", HasKL) + .Case("widekl", HasWIDEKL) .Case("lwp", HasLWP) .Case("lzcnt", HasLZCNT) .Case("mm3dnow", MMX3DNowLevel >= AMD3DNow) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index 25dc9458c25a6e..7b2b7dcf64604e 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -127,6 +127,8 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasPTWRITE = false; bool HasINVPCID = false; bool HasENQCMD = false; + bool HasKL = false; // For key locker + bool HasWIDEKL = false; // For wide key locker bool HasAMXTILE = false; bool 
HasAMXINT8 = false; bool HasAMXBF16 = false; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 57804494d9a51b..bb1c1d1aef3387 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14037,6 +14037,93 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_psubusb128: case X86::BI__builtin_ia32_psubusw128: return EmitX86BinaryIntrinsic(*this, Ops, Intrinsic::usub_sat); + case X86::BI__builtin_ia32_encodekey128: + case X86::BI__builtin_ia32_encodekey256: + case X86::BI__builtin_ia32_aesenc128kl: + case X86::BI__builtin_ia32_aesdec128kl: + case X86::BI__builtin_ia32_aesenc256kl: + case X86::BI__builtin_ia32_aesdec256kl: + case X86::BI__builtin_ia32_aesencwide128kl: + case X86::BI__builtin_ia32_aesdecwide128kl: + case X86::BI__builtin_ia32_aesencwide256kl: + case X86::BI__builtin_ia32_aesdecwide256kl: { + int FirstReturnOp; + int ResultCount; + SmallVector InOps; + unsigned ID; + + switch (BuiltinID) { + default: llvm_unreachable("Unsupported intrinsic!"); + case X86::BI__builtin_ia32_encodekey128: + ID = Intrinsic::x86_encodekey128; + InOps = {Ops[0], Ops[1]}; + FirstReturnOp = 2; + ResultCount = 6; + break; + case X86::BI__builtin_ia32_encodekey256: + ID = Intrinsic::x86_encodekey256; + InOps = {Ops[0], Ops[1], Ops[2]}; + FirstReturnOp = 3; + ResultCount = 7; + break; + case X86::BI__builtin_ia32_aesenc128kl: + case X86::BI__builtin_ia32_aesdec128kl: + case X86::BI__builtin_ia32_aesenc256kl: + case X86::BI__builtin_ia32_aesdec256kl: { + InOps = {Ops[1], Ops[2]}; + FirstReturnOp = 0; + ResultCount = 1; + switch (BuiltinID) { + case X86::BI__builtin_ia32_aesenc128kl: + ID = Intrinsic::x86_aesenc128kl; + break; + case X86::BI__builtin_ia32_aesdec128kl: + ID = Intrinsic::x86_aesdec128kl; + break; + case X86::BI__builtin_ia32_aesenc256kl: + ID = Intrinsic::x86_aesenc256kl; + break; + case X86::BI__builtin_ia32_aesdec256kl: + ID = Intrinsic::x86_aesdec256kl; + break; + } 
+ break; + } + case X86::BI__builtin_ia32_aesencwide128kl: + case X86::BI__builtin_ia32_aesdecwide128kl: + case X86::BI__builtin_ia32_aesencwide256kl: + case X86::BI__builtin_ia32_aesdecwide256kl: { + InOps = {Ops[0], Ops[9], Ops[10], Ops[11], Ops[12], Ops[13], + Ops[14], Ops[15], Ops[16]}; + FirstReturnOp = 1; + ResultCount = 8; + switch (BuiltinID) { + case X86::BI__builtin_ia32_aesencwide128kl: + ID = Intrinsic::x86_aesencwide128kl; + break; + case X86::BI__builtin_ia32_aesdecwide128kl: + ID = Intrinsic::x86_aesdecwide128kl; + break; + case X86::BI__builtin_ia32_aesencwide256kl: + ID = Intrinsic::x86_aesencwide256kl; + break; + case X86::BI__builtin_ia32_aesdecwide256kl: + ID = Intrinsic::x86_aesdecwide256kl; + break; + } + break; + } + } + + Value *Call = Builder.CreateCall(CGM.getIntrinsic(ID), InOps); + + for (int i = 0; i < ResultCount; ++i) { + Builder.CreateDefaultAlignedStore(Builder.CreateExtractValue(Call, i + 1), + Ops[FirstReturnOp + i]); + } + + return Builder.CreateExtractValue(Call, 0); + } } } diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index a9761f04906754..8c12d5ab935d8b 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -72,6 +72,8 @@ set(files inttypes.h invpcidintrin.h iso646.h + keylockerintrin.h + keylocker_wide_intrin.h limits.h lwpintrin.h lzcntintrin.h diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index e9dff2310fdf7f..1beade1be24840 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -471,6 +471,16 @@ _storebe_i64(void * __P, long long __D) { #include #endif +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__KL__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__WIDEKL__) +#include +#endif + #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ defined(__AMXTILE__) || defined(__AMXINT8__) 
|| defined(__AMXBF16__) #include diff --git a/clang/lib/Headers/keylocker_wide_intrin.h b/clang/lib/Headers/keylocker_wide_intrin.h new file mode 100644 index 00000000000000..9b6c9ccab811f1 --- /dev/null +++ b/clang/lib/Headers/keylocker_wide_intrin.h @@ -0,0 +1,259 @@ +/*===-------------- keylocker_wide_intrin.h - KL_WIDE Intrinsics ------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef _KEYLOCKERINTRIN_WIDE_H +#define _KEYLOCKERINTRIN_WIDE_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("kl,widekl"),\ + __min_vector_width__(128))) + +/// Encrypt __idata[0] to __idata[7] using 128-bit AES key indicated by handle +/// at __h and store each resultant block back from __odata to __odata+7. And +/// return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESENCWIDE128KL instructions. +/// +/// \operation +/// Handle := MEM[__h+383:__h] +/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) +/// IF Authentic == 0 +/// ZF := 1 +/// ELSE +/// FOR i := 0 to 7 +/// __odata[i] := AES128Encrypt (__idata[i], UnwrappedKey) +/// ENDFOR +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { + return __builtin_ia32_aesencwide128kl(__h, + __odata, + __odata + 1, + __odata + 2, + __odata + 3, + __odata + 4, + __odata + 5, + __odata + 6, + __odata + 7, + __idata[0], + __idata[1], + __idata[2], + __idata[3], + __idata[4], + __idata[5], + __idata[6], + __idata[7]); +} + +/// Encrypt __idata[0] to __idata[7] using 256-bit AES key indicated by handle +/// at __h and store each resultant block back from __odata to __odata+7. And +/// return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESENCWIDE256KL instructions. 
+/// +/// \operation +/// Handle[511:0] := MEM[__h+511:__h] +/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) +/// IF Authentic == 0 +/// ZF := 1 +/// ELSE +/// FOR i := 0 to 7 +/// __odata[i] := AES256Encrypt (__idata[i], UnwrappedKey) +/// ENDFOR +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { + return __builtin_ia32_aesencwide256kl(__h, + __odata, + __odata + 1, + __odata + 2, + __odata + 3, + __odata + 4, + __odata + 5, + __odata + 6, + __odata + 7, + __idata[0], + __idata[1], + __idata[2], + __idata[3], + __idata[4], + __idata[5], + __idata[6], + __idata[7]); +} + +/// Decrypt __idata[0] to __idata[7] using 128-bit AES key indicated by handle +/// at __h and store each resultant block back from __odata to __odata+7. And +/// return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESDECWIDE128KL instructions. 
+/// +/// \operation +/// Handle[383:0] := MEM[__h+383:__h] +/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES128 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) +/// IF Authentic == 0 +/// ZF := 1 +/// ELSE +/// FOR i := 0 to 7 +/// __odata[i] := AES128Decrypt (__idata[i], UnwrappedKey) +/// ENDFOR +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { + return __builtin_ia32_aesdecwide128kl(__h, + __odata, + __odata + 1, + __odata + 2, + __odata + 3, + __odata + 4, + __odata + 5, + __odata + 6, + __odata + 7, + __idata[0], + __idata[1], + __idata[2], + __idata[3], + __idata[4], + __idata[5], + __idata[6], + __idata[7]); +} + +/// Decrypt __idata[0] to __idata[7] using 256-bit AES key indicated by handle +/// at __h and store each resultant block back from __odata to __odata+7. And +/// return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESDECWIDE256KL instructions. 
+/// +/// \operation +/// Handle[511:0] := MEM[__h+511:__h] +/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES256 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) +/// IF Authentic == 0 +/// ZF := 1 +/// ELSE +/// FOR i := 0 to 7 +/// __odata[i] := AES256Decrypt (__idata[i], UnwrappedKey) +/// ENDFOR +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { + return __builtin_ia32_aesdecwide256kl(__h, + __odata, + __odata + 1, + __odata + 2, + __odata + 3, + __odata + 4, + __odata + 5, + __odata + 6, + __odata + 7, + __idata[0], + __idata[1], + __idata[2], + __idata[3], + __idata[4], + __idata[5], + __idata[6], + __idata[7]); +} + + +#undef __DEFAULT_FN_ATTRS + +#endif /* _KEYLOCKERINTRIN_WIDE_H */ diff --git a/clang/lib/Headers/keylockerintrin.h b/clang/lib/Headers/keylockerintrin.h new file mode 100644 index 00000000000000..5bd4fe59c6be0b --- /dev/null +++ b/clang/lib/Headers/keylockerintrin.h @@ -0,0 +1,343 @@ +/*===----------------- keylockerintrin.h - KL Intrinsics -------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or 
substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef _KEYLOCKERINTRIN_H +#define _KEYLOCKERINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("kl"),\ + __min_vector_width__(128))) + +/// Load internal wrapping key from __intkey, __enkey_lo and __enkey_hi. __ctl +/// will be assigned to EAX, which specifies the KeySource and whether backing +/// up the key is permitted. The 256-bit encryption key is loaded from the two +/// explicit operands (__enkey_lo and __enkey_hi). The 128-bit integrity key is +/// loaded from the implicit operand XMM0, which is assigned from __intkey. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the LOADIWKEY instructions. 
+/// +/// \operation +/// IF CPL > 0 // LOADIWKEY only allowed at ring 0 (supervisor mode) +/// GP (0) +/// FI +/// IF “LOADIWKEY exiting” VM execution control set +/// VMexit +/// FI +/// IF __ctl[4:1] > 1 // Reserved KeySource encoding used +/// GP (0) +/// FI +/// IF __ctl[31:5] != 0 // Reserved bit in __ctl is set +/// GP (0) +/// FI +/// IF __ctl[0] AND (CPUID.19H.ECX[0] == 0) // NoBackup is not supported on this part +/// GP (0) +/// FI +/// IF (__ctl[4:1] == 1) AND (CPUID.19H.ECX[1] == 0) // KeySource of 1 is not supported on this part +/// GP (0) +/// FI +/// IF (__ctl[4:1] == 0) // KeySource of 0. +/// IWKey.Encryption Key[127:0] := __enkey_hi[127:0] +/// IWKey.Encryption Key[255:128] := __enkey_lo[127:0] +/// IWKey.IntegrityKey[127:0] := __intkey[127:0] +/// IWKey.NoBackup := __ctl[0] +/// IWKey.KeySource := __ctl[4:1] +/// ZF := 0 +/// ELSE // KeySource of 1. See RDSEED definition for details of randomness +/// IF HW_NRND_GEN.ready == 1 // Full-entropy random data from RDSEED was received +/// IWKey.Encryption Key[127:0] := __enkey_hi[127:0] XOR HW_NRND_GEN.data[127:0] +/// IWKey.Encryption Key[255:128] := __enkey_lo[127:0] XOR HW_NRND_GEN.data[255:128] +/// IWKey.Encryption Key[255:0] := __enkey_hi[127:0]:__enkey_lo[127:0] XOR HW_NRND_GEN.data[255:0] +/// IWKey.IntegrityKey[127:0] := __intkey[127:0] XOR HW_NRND_GEN.data[383:256] +/// IWKey.NoBackup := __ctl[0] +/// IWKey.KeySource := __ctl[4:1] +/// ZF := 0 +/// ELSE // Random data was not returned from RDSEED. 
IWKey was not loaded +/// ZF := 1 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ void __DEFAULT_FN_ATTRS +_mm_loadiwkey (unsigned int __ctl, __m128i __intkey, + __m128i __enkey_lo, __m128i __enkey_hi) { + __builtin_ia32_loadiwkey (__ctl, __intkey, __enkey_lo, __enkey_hi); +} + +/// Wrap a 128-bit AES key from __key into a key handle and output in +/// ((__m128i*)__h) to ((__m128i*)__h) + 5 and a 32-bit value as return. +/// The explicit source operand __htype specifies handle restrictions. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the ENCODEKEY128 instructions. +/// +/// \operation +/// InputKey[127:0] := __key[127:0] +/// KeyMetadata[2:0] := __htype[2:0] +/// KeyMetadata[23:3] := 0 // Reserved for future usage +/// KeyMetadata[27:24] := 0 // KeyType is AES-128 (value of 0) +/// KeyMetadata[127:28] := 0 // Reserved for future usage +/// Handle[383:0] := WrapKey128(InputKey[127:0], KeyMetadata[127:0], +/// IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0]) +/// dst[0] := IWKey.NoBackup +/// dst[4:1] := IWKey.KeySource[3:0] +/// dst[31:5] := 0 +/// MEM[__h+127:__h] := Handle[127:0] // AAD +/// MEM[__h+255:__h+128] := Handle[255:128] // Integrity Tag +/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText +/// MEM[__h+511:__h+384] := 0 // Reserved for future usage +/// MEM[__h+639:__h+512] := 0 // Reserved for future usage +/// MEM[__h+767:__h+640] := 0 // Reserved for future usage +/// OF := 0 +/// SF := 0 +/// ZF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) { + __m128i *__results = (__m128i*)__h; + + return __builtin_ia32_encodekey128(__htype, __key, + __results, + __results + 1, + __results + 2, + __results + 3, + __results + 4, + __results + 5); +} + +/// Wrap a 256-bit AES key from __key_hi:__key_lo into a key 
handle, then +/// output handle in ((__m128i*)__h) to ((__m128i*)__h) + 6 and +/// a 32-bit value as return. +/// The explicit source operand __htype specifies handle restrictions. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the ENCODEKEY256 instructions. +/// +/// \operation +/// InputKey[127:0] := __key_lo[127:0] +/// InputKey[255:128] := __key_hi[127:0] +/// KeyMetadata[2:0] := __htype[2:0] +/// KeyMetadata[23:3] := 0 // Reserved for future usage +/// KeyMetadata[27:24] := 1 // KeyType is AES-256 (value of 1) +/// KeyMetadata[127:28] := 0 // Reserved for future usage +/// Handle[511:0] := WrapKey256(InputKey[255:0], KeyMetadata[127:0], +/// IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0]) +/// dst[0] := IWKey.NoBackup +/// dst[4:1] := IWKey.KeySource[3:0] +/// dst[31:5] := 0 +/// MEM[__h+127:__h] := Handle[127:0] // AAD +/// MEM[__h+255:__h+128] := Handle[255:128] // Tag +/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText[127:0] +/// MEM[__h+511:__h+384] := Handle[511:384] // CipherText[255:128] +/// MEM[__h+639:__h+512] := 0 // Reserved for future usage +/// MEM[__h+767:__h+640] := 0 // Reserved for future usage +/// MEM[__h+895:__h+768] := 0 // Reserved for future usage +/// OF := 0 +/// SF := 0 +/// ZF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi, + void *__h) { + __m128i *__results = (__m128i*)__h; + + return __builtin_ia32_encodekey256(__htype, __key_lo, __key_hi, + __results, + __results + 1, + __results + 2, + __results + 3, + __results + 4, + __results + 5, + __results + 6); +} + +/// The AESENC128KL performs 10 rounds of AES to encrypt the __idata using +/// the 128-bit key in the handle from the __h. It stores the result in the +/// __odata. And return the affected ZF flag status. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESENC128KL instructions. +/// +/// \operation +/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic. +/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[383:256] || +/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) +/// IF (Authentic == 0) +/// ZF := 1 +/// ELSE +/// MEM[__odata+127:__odata] := AES128Encrypt (__idata[127:0], UnwrappedKey) +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { + return __builtin_ia32_aesenc128kl(__odata, __idata, __h); +} + +/// The AESENC256KL performs 14 rounds of AES to encrypt the __idata using +/// the 256-bit key in the handle from the __h. It stores the result in the +/// __odata. And return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESENC256KL instructions. +/// +/// \operation +/// Handle[511:0] := MEM[__h+511:__h] // Load is not guaranteed to be atomic. 
+/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) +/// IF (Authentic == 0) +/// ZF := 1 +/// ELSE +/// MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], UnwrappedKey) +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { + return __builtin_ia32_aesenc256kl(__odata, __idata, __h); +} + +/// The AESDEC128KL performs 10 rounds of AES to decrypt the __idata using +/// the 128-bit key in the handle from the __h. It stores the result in the +/// __odata. And return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESDEC128KL instructions. +/// +/// \operation +/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic. 
+/// IllegalHandle := (HandleReservedBitSet (Handle[383:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[383:256] || +/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) +/// IF (Authentic == 0) +/// ZF := 1 +/// ELSE +/// MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], UnwrappedKey) +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { + return __builtin_ia32_aesdec128kl(__odata, __idata, __h); +} + +/// The AESDEC256KL performs 14 rounds of AES to decrypt the __idata using +/// the 256-bit key in the handle from the __h. It stores the result in the +/// __odata. And return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESDEC256KL instructions. 
+/// +/// \operation +/// Handle[511:0] := MEM[__h+511:__h] +/// IllegalHandle := (HandleReservedBitSet (Handle[511:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[383:256] || +/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) +/// IF (Authentic == 0) +/// ZF := 1 +/// ELSE +/// MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], UnwrappedKey) +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { + return __builtin_ia32_aesdec256kl(__odata, __idata, __h); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* _KEYLOCKERINTRIN_H */ diff --git a/clang/test/CodeGen/X86/keylocker.c b/clang/test/CodeGen/X86/keylocker.c new file mode 100644 index 00000000000000..835bdd279ef1fe --- /dev/null +++ b/clang/test/CodeGen/X86/keylocker.c @@ -0,0 +1,72 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +kl -target-feature +widekl -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +kl -target-feature +widekl -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +widekl -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 %s -ffreestanding -triple=i386-unknown-unknown -target-feature +widekl -emit-llvm -o - -Wall -Werror | FileCheck %s + +#include + +void test_loadiwkey(unsigned int ctl, __m128i intkey, __m128i enkey_lo, __m128i enkey_hi) { + //CHECK-LABEL: @test_loadiwkey + //CHECK: @llvm.x86.loadiwkey + _mm_loadiwkey(ctl, intkey, enkey_lo, enkey_hi); +} + +unsigned int test_encodekey128_u32(unsigned int htype, __m128i key, void *h) { + 
//CHECK-LABEL: @test_encodekey128_u32 + //CHECK: call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 %{{.*}}, <2 x i64> %{{.*}}) + return _mm_encodekey128_u32(htype, key, h); +} + +unsigned int test_encodekey256_u32(unsigned int htype, __m128i key_lo, __m128i key_hi, void *h) { + //CHECK-LABEL: @test_encodekey256_u32 + //CHECK: call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + return _mm_encodekey256_u32(htype, key_lo, key_hi, h); +} + +unsigned char test_mm_aesenc256kl_u8(__m128i *odata, __m128i idata, const void *h) { + //CHECK-LABEL: @test_mm_aesenc256kl_u8 + //CHECK: call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> %{{.*}}, i8* %{{.*}}) + return _mm_aesenc256kl_u8(odata, idata, h); +} + +unsigned char test_mm_aesdec256kl_u8(__m128i *odata, __m128i idata, const void *h) { + //CHECK-LABEL: @test_mm_aesdec256kl_u8 + //CHECK: call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> %{{.*}}, i8* %{{.*}}) + return _mm_aesdec256kl_u8(odata, idata, h); +} + +unsigned char test_mm_aesenc128kl_u8(__m128i *odata, __m128i idata, const void *h) { + //CHECK-LABEL: @test_mm_aesenc128kl_u8 + //CHECK: call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> %{{.*}}, i8* %{{.*}}) + return _mm_aesenc128kl_u8(odata, idata, h); +} + +unsigned char test_mm_aesdec128kl_u8(__m128i *odata, __m128i idata, const void *h) { + //CHECK-LABEL: @test_mm_aesdec128kl_u8 + //CHECK: call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> %{{.*}}, i8* %{{.*}}) + return _mm_aesdec128kl_u8(odata, idata, h); +} + +unsigned char test__mm_aesencwide128kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) { + //CHECK-LABEL: @test__mm_aesencwide128kl + //CHECK: call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> 
%{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + return _mm_aesencwide128kl_u8(odata, idata, h); +} + +unsigned char test__mm_aesdecwide128kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) { + //CHECK-LABEL: @test__mm_aesdecwide128kl + //CHECK: call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + return _mm_aesdecwide128kl_u8(odata, idata, h); +} + +unsigned char test__mm_aesencwide256kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) { + //CHECK-LABEL: @test__mm_aesencwide256kl + //CHECK: call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + return _mm_aesencwide256kl_u8(odata, idata, h); +} + +unsigned char test__mm_aesdecwide256kl_u8(__m128i odata[8], const __m128i idata[8], const void* h) { + //CHECK-LABEL: @test__mm_aesdecwide256kl + //CHECK: call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}) + return _mm_aesdecwide256kl_u8(odata, idata, h); +} diff --git a/clang/test/CodeGen/attr-target-x86.c b/clang/test/CodeGen/attr-target-x86.c index 304e5b78d34667..738b65b1113104 100644 --- a/clang/test/CodeGen/attr-target-x86.c +++ b/clang/test/CodeGen/attr-target-x86.c @@ -50,7 +50,7 @@ int __attribute__((target("tune=sandybridge"))) walrus(int a) { return 4; } // 
CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87" "tune-cpu"="i686" // CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" // CHECK-NOT: tune-cpu -// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-f16c,-fma,-fma4,-gfni,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" +// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686" // CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686" // CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686" // CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes" diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c index 85a9374ab90577..9de728c19c7cab 100644 --- a/clang/test/Driver/x86-target-features.c +++ 
b/clang/test/Driver/x86-target-features.c @@ -254,6 +254,16 @@ // TSXLDTRK: "-target-feature" "+tsxldtrk" // NO-TSXLDTRK: "-target-feature" "-tsxldtrk" +// RUN: %clang -target i386-linux-gnu -mkl %s -### -o %t.o 2>&1 | FileCheck -check-prefix=KL %s +// RUN: %clang -target i386-linux-gnu -mno-kl %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-KL %s +// KL: "-target-feature" "+kl" +// NO-KL: "-target-feature" "-kl" + +// RUN: %clang -target i386-linux-gnu -mwidekl %s -### -o %t.o 2>&1 | FileCheck -check-prefix=WIDE_KL %s +// RUN: %clang -target i386-linux-gnu -mno-widekl %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-WIDE_KL %s +// WIDE_KL: "-target-feature" "+widekl" +// NO-WIDE_KL: "-target-feature" "-widekl" + // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mamx-tile %s -### -o %t.o 2>&1 | FileCheck --check-prefix=AMX-TILE %s // RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-amx-tile %s -### -o %t.o 2>&1 | FileCheck --check-prefix=NO-AMX-TILE %s // AMX-TILE: "-target-feature" "+amx-tile" diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c index 4a46a131afa74d..59bc9d6ab531ea 100644 --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -486,6 +486,25 @@ // NOVP2INTERSECT-NOT: #define __AVX512VP2INTERSECT__ 1 + +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mkl -x c -E -dM -o - %s | FileCheck -check-prefix=KEYLOCKER %s +// KEYLOCKER: #define __KL__ 1 + +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-kl -x c -E -dM -o - %s | FileCheck -check-prefix=NOKEYLOCKER %s +// NOKEYLOCKER-NOT: #define __KL__ 1 + +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mwidekl -x c -E -dM -o - %s | FileCheck -check-prefix=KEYLOCKERW %s +// KEYLOCKERW: #define __KL__ 1 +// KEYLOCKERW: #define __WIDEKL__ 1 + +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-widekl -x c -E -dM -o - %s | FileCheck 
-check-prefix=NOKEYLOCKERW %s +// NOKEYLOCKERW-NOT: #define __KL__ 1 +// NOKEYLOCKERW-NOT: #define __WIDEKL__ 1 + +// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mwidekl -mno-kl -x c -E -dM -o - %s | FileCheck -check-prefix=NOKEYLOCKERW2 %s +// NOKEYLOCKERW2-NOT: #define __KL__ 1 +// NOKEYLOCKERW2-NOT: #define __WIDEKL__ 1 + // RUN: %clang -target i386-unknown-unknown -march=atom -menqcmd -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=ENQCMD %s // ENQCMD: #define __ENQCMD__ 1 diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 51ecb978856436..5708a761919f57 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -4948,6 +4948,59 @@ let TargetPrefix = "x86" in { def int_x86_xresldtrk : GCCBuiltin<"__builtin_ia32_xresldtrk">, Intrinsic<[], [], []>; } + +//===----------------------------------------------------------------------===// +// Key Locker +let TargetPrefix = "x86" in { + def int_x86_loadiwkey : GCCBuiltin<"__builtin_ia32_loadiwkey">, + Intrinsic<[], [llvm_i32_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], + []>; + def int_x86_encodekey128 : + Intrinsic<[llvm_i32_ty, llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], + [llvm_i32_ty, llvm_v2i64_ty], []>; + def int_x86_encodekey256 : + Intrinsic<[llvm_i32_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], + [llvm_i32_ty, llvm_v2i64_ty, llvm_v2i64_ty], []>; + def int_x86_aesenc128kl : + Intrinsic<[llvm_i8_ty, llvm_v2i64_ty], [llvm_v2i64_ty, llvm_ptr_ty], []>; + def int_x86_aesdec128kl : + Intrinsic<[llvm_i8_ty, llvm_v2i64_ty], [llvm_v2i64_ty, llvm_ptr_ty], []>; + def int_x86_aesenc256kl : + Intrinsic<[llvm_i8_ty, llvm_v2i64_ty], [llvm_v2i64_ty, llvm_ptr_ty], []>; + def int_x86_aesdec256kl : + Intrinsic<[llvm_i8_ty, llvm_v2i64_ty], [llvm_v2i64_ty, llvm_ptr_ty], []>; + def 
int_x86_aesencwide128kl : + Intrinsic<[llvm_i8_ty, llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], []>; + def int_x86_aesdecwide128kl : + Intrinsic<[llvm_i8_ty, llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], []>; + def int_x86_aesencwide256kl : + Intrinsic<[llvm_i8_ty, llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], []>; + def int_x86_aesdecwide256kl : + Intrinsic<[llvm_i8_ty, llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], + [llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, + llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], []>; +} + //===----------------------------------------------------------------------===// // AMX - Intel AMX extensions diff --git a/llvm/include/llvm/Support/X86TargetParser.def b/llvm/include/llvm/Support/X86TargetParser.def index e3998c99a50a64..2a803ca7a68916 100644 --- a/llvm/include/llvm/Support/X86TargetParser.def +++ b/llvm/include/llvm/Support/X86TargetParser.def @@ -154,6 +154,8 @@ X86_FEATURE (F16C, "f16c") X86_FEATURE (FSGSBASE, "fsgsbase") X86_FEATURE (FXSR, "fxsr") X86_FEATURE (INVPCID, "invpcid") +X86_FEATURE (KL, "kl") +X86_FEATURE (WIDEKL, "widekl") X86_FEATURE (LWP, "lwp") X86_FEATURE (LZCNT, "lzcnt") X86_FEATURE (MOVBE, "movbe") diff --git a/llvm/lib/IR/Function.cpp 
b/llvm/lib/IR/Function.cpp index 16cf1bd8a117d4..8d741c3125a84e 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -833,7 +833,8 @@ enum IIT_Info { IIT_SUBDIVIDE4_ARG = 45, IIT_VEC_OF_BITCASTS_TO_INT = 46, IIT_V128 = 47, - IIT_BF16 = 48 + IIT_BF16 = 48, + IIT_STRUCT9 = 49 }; static void DecodeIITType(unsigned &NextElt, ArrayRef Infos, @@ -995,6 +996,7 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef Infos, case IIT_EMPTYSTRUCT: OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct, 0)); return; + case IIT_STRUCT9: ++StructElts; LLVM_FALLTHROUGH; case IIT_STRUCT8: ++StructElts; LLVM_FALLTHROUGH; case IIT_STRUCT7: ++StructElts; LLVM_FALLTHROUGH; case IIT_STRUCT6: ++StructElts; LLVM_FALLTHROUGH; diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp index 26534580d02d37..0f674bbcdc1bb8 100644 --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -1469,6 +1469,7 @@ bool sys::getHostCPUFeatures(StringMap &Features) { Features["avx512bitalg"] = HasLeaf7 && ((ECX >> 12) & 1) && HasAVX512Save; Features["avx512vpopcntdq"] = HasLeaf7 && ((ECX >> 14) & 1) && HasAVX512Save; Features["rdpid"] = HasLeaf7 && ((ECX >> 22) & 1); + Features["kl"] = HasLeaf7 && ((ECX >> 23) & 1); // key locker Features["cldemote"] = HasLeaf7 && ((ECX >> 25) & 1); Features["movdiri"] = HasLeaf7 && ((ECX >> 27) & 1); Features["movdir64b"] = HasLeaf7 && ((ECX >> 28) & 1); @@ -1509,6 +1510,10 @@ bool sys::getHostCPUFeatures(StringMap &Features) { Features["ptwrite"] = HasLeaf14 && ((EBX >> 4) & 1); + bool HasLeaf19 = + MaxLevel >= 0x19 && !getX86CpuIDAndInfo(0x19, &EAX, &EBX, &ECX, &EDX); + Features["widekl"] = HasLeaf7 && HasLeaf19 && ((EBX >> 2) & 1); + return true; } #elif defined(__linux__) && (defined(__arm__) || defined(__aarch64__)) diff --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp index b7d9bd4f865c90..99836b8460def2 100644 --- a/llvm/lib/Support/X86TargetParser.cpp +++ 
b/llvm/lib/Support/X86TargetParser.cpp @@ -194,7 +194,7 @@ static constexpr FeatureBitset FeaturesICLServer = FeaturesICLClient | FeaturePCONFIG | FeatureWBNOINVD; static constexpr FeatureBitset FeaturesTigerlake = FeaturesICLClient | FeatureAVX512VP2INTERSECT | FeatureMOVDIR64B | - FeatureMOVDIRI | FeatureSHSTK; + FeatureMOVDIRI | FeatureSHSTK | FeatureKL | FeatureWIDEKL; static constexpr FeatureBitset FeaturesSapphireRapids = FeaturesICLServer | FeatureAMX_TILE | FeatureAMX_INT8 | FeatureAMX_BF16 | FeatureAVX512BF16 | FeatureAVX512VP2INTERSECT | FeatureCLDEMOTE | FeatureENQCMD | @@ -538,6 +538,10 @@ static constexpr FeatureBitset ImpliedFeaturesAMX_TILE = {}; static constexpr FeatureBitset ImpliedFeaturesAMX_BF16 = FeatureAMX_TILE; static constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE; +// Key Locker Features +static constexpr FeatureBitset ImpliedFeaturesKL = FeatureSSE2; +static constexpr FeatureBitset ImpliedFeaturesWIDEKL = FeatureKL; + static constexpr FeatureInfo FeatureInfos[X86::CPU_FEATURE_MAX] = { #define X86_FEATURE(ENUM, STR) {{STR}, ImpliedFeatures##ENUM}, #include "llvm/Support/X86TargetParser.def" diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index f2651d658d71ce..e5d47a0ac3255b 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -279,6 +279,12 @@ def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true", "Wait and pause enhancements">; def FeatureENQCMD : SubtargetFeature<"enqcmd", "HasENQCMD", "true", "Has ENQCMD instructions">; +def FeatureKL : SubtargetFeature<"kl", "HasKL", "true", + "Support Key Locker kl Instructions", + [FeatureSSE2]>; +def FeatureWIDEKL : SubtargetFeature<"widekl", "HasWIDEKL", "true", + "Support Key Locker wide Instructions", + [FeatureKL]>; def FeatureSERIALIZE : SubtargetFeature<"serialize", "HasSERIALIZE", "true", "Has serialize instruction">; def FeatureTSXLDTRK : SubtargetFeature<"tsxldtrk", "HasTSXLDTRK", "true", diff --git 
a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4b3adc7dcfbc9b..d0fd1046fdeb7f 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -25966,6 +25966,185 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, Op->getOperand(3), Op->getOperand(4)}); return Chain; } + case Intrinsic::x86_encodekey128: + case Intrinsic::x86_encodekey256: { + SDLoc DL(Op); + SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other, MVT::Glue); + SDValue Chain = Op.getOperand(0); + bool IsEK256 = false; + Chain = DAG.getCopyToReg(Chain, DL, X86::XMM0, Op->getOperand(3), + SDValue()); + + unsigned Opcode; + + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); + case Intrinsic::x86_encodekey128: + Opcode = X86::ENCODEKEY128; + break; + case Intrinsic::x86_encodekey256: + Opcode = X86::ENCODEKEY256; + Chain = DAG.getCopyToReg(Chain, DL, X86::XMM1, Op->getOperand(4), + Chain.getValue(1)); + IsEK256 = true; + break; + } + + SDNode *Res = DAG.getMachineNode(Opcode, DL, VTs, + {Op.getOperand(2), Chain, + Chain.getValue(1)}); + + Chain = SDValue(Res, 1); + + SDValue XMM0 = DAG.getCopyFromReg(Chain, DL, X86::XMM0, MVT::v16i8, + SDValue(Res, 2)); + SDValue XMM1 = DAG.getCopyFromReg(XMM0.getValue(1), DL, X86::XMM1, + MVT::v16i8, XMM0.getValue(2)); + SDValue XMM2 = DAG.getCopyFromReg(XMM1.getValue(1), DL, X86::XMM2, + MVT::v16i8, XMM1.getValue(2)); + SDValue XMM3, XMM4; + if (IsEK256) { + XMM3 = DAG.getCopyFromReg(XMM2.getValue(1), DL, X86::XMM3, + MVT::v16i8, XMM2.getValue(2)); + XMM4 = DAG.getCopyFromReg(XMM3.getValue(1), DL, X86::XMM4, + MVT::v16i8, XMM3.getValue(2)); + } else { + XMM4 = DAG.getCopyFromReg(XMM2.getValue(1), DL, X86::XMM4, + MVT::v16i8, XMM2.getValue(2)); + } + SDValue XMM5 = DAG.getCopyFromReg(XMM4.getValue(1), DL, X86::XMM5, + MVT::v16i8, XMM4.getValue(2)); + SDValue XMM6 = DAG.getCopyFromReg(XMM5.getValue(1), DL, X86::XMM6, + MVT::v16i8, 
XMM5.getValue(2)); + + if (IsEK256) { + return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), + {SDValue(Res, 0), + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, Chain}); + } else { + return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), + {SDValue(Res, 0), + XMM0, XMM1, XMM2, XMM4, XMM5, XMM6, Chain}); + } + } + case Intrinsic::x86_aesenc128kl: + case Intrinsic::x86_aesdec128kl: + case Intrinsic::x86_aesenc256kl: + case Intrinsic::x86_aesdec256kl: { + SDLoc DL(Op); + SDVTList VTs = DAG.getVTList(MVT::v16i8, MVT::Other, MVT::Glue); + SDValue Chain = Op.getOperand(0); + unsigned Opcode; + + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); + case Intrinsic::x86_aesenc128kl: + Opcode = X86::AESENC128KL; + break; + case Intrinsic::x86_aesdec128kl: + Opcode = X86::AESDEC128KL; + break; + case Intrinsic::x86_aesenc256kl: + Opcode = X86::AESENC256KL; + break; + case Intrinsic::x86_aesdec256kl: + Opcode = X86::AESDEC256KL; + break; + } + + SDValue XMM = Op.getOperand(2); + SDValue Base = Op.getOperand(3); + SDValue Index = DAG.getRegister(0, MVT::i32); + SDValue Scale = DAG.getTargetConstant(1, DL, MVT::i8); + SDValue Disp = DAG.getTargetConstant(0, DL, MVT::i32); + SDValue Segment = DAG.getRegister(0, MVT::i32); + + SDNode *Res = DAG.getMachineNode(Opcode, DL, VTs, {XMM, Base, Scale, Index, + Disp, Segment, Chain}); + Chain = SDValue(Res, 1); + SDValue EFLAGS = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, + SDValue(Res, 2)); + SDValue ZF = getSETCC(X86::COND_E, EFLAGS.getValue(0), DL, DAG); + + return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), + {ZF, SDValue(Res, 0), EFLAGS.getValue(1)}); + } + case Intrinsic::x86_aesencwide128kl: + case Intrinsic::x86_aesdecwide128kl: + case Intrinsic::x86_aesencwide256kl: + case Intrinsic::x86_aesdecwide256kl: { + SDLoc DL(Op); + SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Chain = Op.getOperand(0); + unsigned Opcode; + + switch (IntNo) { + default: 
llvm_unreachable("Impossible intrinsic"); + case Intrinsic::x86_aesencwide128kl: + Opcode = X86::AESENCWIDE128KL; + break; + case Intrinsic::x86_aesdecwide128kl: + Opcode = X86::AESDECWIDE128KL; + break; + case Intrinsic::x86_aesencwide256kl: + Opcode = X86::AESENCWIDE256KL; + break; + case Intrinsic::x86_aesdecwide256kl: + Opcode = X86::AESDECWIDE256KL; + break; + } + + SDValue Base = Op.getOperand(2); + SDValue Index = DAG.getRegister(0, MVT::i32); + SDValue Scale = DAG.getTargetConstant(1, DL, MVT::i8); + SDValue Disp = DAG.getTargetConstant(0, DL, MVT::i32); + SDValue Segment = DAG.getRegister(0, MVT::i32); + + Chain = DAG.getCopyToReg(Chain, DL, X86::XMM0, Op->getOperand(3), + SDValue()); + Chain = DAG.getCopyToReg(Chain.getValue(0), DL, X86::XMM1, + Op->getOperand(4), Chain.getValue(1)); + Chain = DAG.getCopyToReg(Chain.getValue(0), DL, X86::XMM2, + Op->getOperand(5), Chain.getValue(1)); + Chain = DAG.getCopyToReg(Chain.getValue(0), DL, X86::XMM3, + Op->getOperand(6), Chain.getValue(1)); + Chain = DAG.getCopyToReg(Chain.getValue(0), DL, X86::XMM4, + Op->getOperand(7), Chain.getValue(1)); + Chain = DAG.getCopyToReg(Chain.getValue(0), DL, X86::XMM5, + Op->getOperand(8), Chain.getValue(1)); + Chain = DAG.getCopyToReg(Chain.getValue(0), DL, X86::XMM6, + Op->getOperand(9), Chain.getValue(1)); + Chain = DAG.getCopyToReg(Chain.getValue(0), DL, X86::XMM7, + Op->getOperand(10),Chain.getValue(1)); + + SDNode *Res = DAG.getMachineNode(Opcode, DL, VTs, {Base, Scale, Index, + Disp, Segment, Chain, + Chain.getValue(1)}); + + Chain = SDValue(Res, 0); + SDValue EFLAGS = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, + SDValue(Res, 1)); + SDValue ZF = getSETCC(X86::COND_E, EFLAGS.getValue(0), DL, DAG); + SDValue XMM0 = DAG.getCopyFromReg(EFLAGS.getValue(1), DL, X86::XMM0, + MVT::v16i8, EFLAGS.getValue(2)); + SDValue XMM1 = DAG.getCopyFromReg(XMM0.getValue(1), DL, X86::XMM1, + MVT::v16i8, XMM0.getValue(2)); + SDValue XMM2 = DAG.getCopyFromReg(XMM1.getValue(1), DL, 
X86::XMM2, + MVT::v16i8, XMM1.getValue(2)); + SDValue XMM3 = DAG.getCopyFromReg(XMM2.getValue(1), DL, X86::XMM3, + MVT::v16i8, XMM2.getValue(2)); + SDValue XMM4 = DAG.getCopyFromReg(XMM3.getValue(1), DL, X86::XMM4, + MVT::v16i8, XMM3.getValue(2)); + SDValue XMM5 = DAG.getCopyFromReg(XMM4.getValue(1), DL, X86::XMM5, + MVT::v16i8, XMM4.getValue(2)); + SDValue XMM6 = DAG.getCopyFromReg(XMM5.getValue(1), DL, X86::XMM6, + MVT::v16i8, XMM5.getValue(2)); + SDValue XMM7 = DAG.getCopyFromReg(XMM6.getValue(1), DL, X86::XMM7, + MVT::v16i8, XMM6.getValue(2)); + return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), + {ZF, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM7.getValue(1)}); + } } return SDValue(); } diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td index 99a9ce2fc7e61f..d13ba5dbc0eb0e 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -971,6 +971,8 @@ def HasCmpxchg8b : Predicate<"Subtarget->hasCmpxchg8b()">; def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">; def HasENQCMD : Predicate<"Subtarget->hasENQCMD()">; +def HasKL : Predicate<"Subtarget->hasKL()">; +def HasWIDEKL : Predicate<"Subtarget->hasWIDEKL()">; def HasSERIALIZE : Predicate<"Subtarget->hasSERIALIZE()">; def HasTSXLDTRK : Predicate<"Subtarget->hasTSXLDTRK()">; def HasAMXTILE : Predicate<"Subtarget->hasAMXTILE()">; @@ -3094,6 +3096,9 @@ include "X86InstrSGX.td" include "X86InstrTDX.td" +// Key Locker instructions +include "X86InstrKL.td" + // AMX instructions include "X86InstrAMX.td" diff --git a/llvm/lib/Target/X86/X86InstrInfo.td.rej b/llvm/lib/Target/X86/X86InstrInfo.td.rej new file mode 100644 index 00000000000000..5c0a632b55a70a --- /dev/null +++ b/llvm/lib/Target/X86/X86InstrInfo.td.rej @@ -0,0 +1,11 @@ +diff a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td (rejected hunks) +@@ -3092,6 +3094,9 @@ include "X86InstrSVM.td" 
+ include "X86InstrTSX.td" + include "X86InstrSGX.td" + ++// Key Locker instructions ++include "X86InstrKL.td" ++ + // AMX instructions + include "X86InstrAMX.td" + diff --git a/llvm/lib/Target/X86/X86InstrKL.td b/llvm/lib/Target/X86/X86InstrKL.td new file mode 100644 index 00000000000000..452410891bd866 --- /dev/null +++ b/llvm/lib/Target/X86/X86InstrKL.td @@ -0,0 +1,66 @@ +//===---------------------------*-tablegen-*-------------------------------===// +//===------------- X86InstrKL.td - KL Instruction Set Extension -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the instructions that make up the Intel key locker +// instruction set. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Key Locker instructions + +let SchedRW = [WriteSystem], Predicates = [HasKL] in { + let Uses = [XMM0, EAX] in { + def LOADIWKEY : I<0xDC, MRMSrcReg, (outs), (ins VR128X:$src1, VR128X:$src2), + "loadiwkey\t{$src2, $src1|$src1, $src2}", + [(int_x86_loadiwkey EAX, XMM0, VR128X:$src1, VR128X:$src2)]>, T8XS; + } + + let Uses = [XMM0], Defs = [XMM0, XMM1, XMM2, XMM4, XMM5, XMM6] in { + def ENCODEKEY128 : I<0xFA, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "encodekey128\t{$src, $dst|$dst, $src}", []>, T8XS; + } + + let Uses = [XMM0, XMM1], Defs = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6] in { + def ENCODEKEY256 : I<0xFB, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), + "encodekey256\t{$src, $dst|$dst, $src}", []>, T8XS; + } + + let Constraints = "$src1 = $dst", + Defs = [EFLAGS] in { + def AESENC128KL : I<0xDC, MRMSrcMem, (outs VR128X:$dst), (ins VR128X:$src1, opaquemem:$src2), + "aesenc128kl\t{$src2, $src1|$src1, $src2}", 
[]>, T8XS; + + def AESDEC128KL : I<0xDD, MRMSrcMem, (outs VR128X:$dst), (ins VR128X:$src1, opaquemem:$src2), + "aesdec128kl\t{$src2, $src1|$src1, $src2}", []>, T8XS; + + def AESENC256KL : I<0xDE, MRMSrcMem, (outs VR128X:$dst), (ins VR128X:$src1, opaquemem:$src2), + "aesenc256kl\t{$src2, $src1|$src1, $src2}", []>, T8XS; + + def AESDEC256KL : I<0xDF, MRMSrcMem, (outs VR128X:$dst), (ins VR128X:$src1, opaquemem:$src2), + "aesdec256kl\t{$src2, $src1|$src1, $src2}", []>, T8XS; + } + +} // SchedRW, Predicates + +let SchedRW = [WriteSystem], Predicates = [HasWIDEKL] in { + let Uses = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7], + Defs = [EFLAGS, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7] in { + def AESENCWIDE128KL : I<0xD8, MRM0m, (outs), (ins opaquemem:$src), + "aesencwide128kl\t$src", []>, T8XS; + def AESDECWIDE128KL : I<0xD8, MRM1m, (outs), (ins opaquemem:$src), + "aesdecwide128kl\t$src", []>, T8XS; + def AESENCWIDE256KL : I<0xD8, MRM2m, (outs), (ins opaquemem:$src), + "aesencwide256kl\t$src", []>, T8XS; + def AESDECWIDE256KL : I<0xD8, MRM3m, (outs), (ins opaquemem:$src), + "aesdecwide256kl\t$src", []>, T8XS; + } + +} // SchedRW, Predicates diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 923f8105870fcc..263be40639db85 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -395,6 +395,12 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// Processor supports PCONFIG instruction bool HasPCONFIG = false; + /// Processor supports Key Locker instructions + bool HasKL = false; + + /// Processor supports Key Locker wide instructions + bool HasWIDEKL = false; + /// Processor supports SERIALIZE instruction bool HasSERIALIZE = false; @@ -728,6 +734,8 @@ class X86Subtarget final : public X86GenSubtargetInfo { bool hasSGX() const { return HasSGX; } bool hasINVPCID() const { return HasINVPCID; } bool hasENQCMD() const { return HasENQCMD; } + bool hasKL() const { return HasKL; } + bool 
hasWIDEKL() const { return HasWIDEKL; } bool hasSERIALIZE() const { return HasSERIALIZE; } bool hasTSXLDTRK() const { return HasTSXLDTRK; } bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; } diff --git a/llvm/test/CodeGen/X86/keylocker-intrinsics.ll b/llvm/test/CodeGen/X86/keylocker-intrinsics.ll new file mode 100644 index 00000000000000..472eed484a16e2 --- /dev/null +++ b/llvm/test/CodeGen/X86/keylocker-intrinsics.ll @@ -0,0 +1,312 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unkown-unknown -mattr=+kl,widekl | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i386-unkown-unknown -mattr=+kl,widekl -mattr=+avx2 | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-unkown-unknown -mattr=+widekl | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i386-unkown-unknown -mattr=+widekl -mattr=+avx2 | FileCheck %s --check-prefix=X32 + +declare void @llvm.x86.loadiwkey(i32, <2 x i64>, <2 x i64>, <2 x i64>) +declare { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32, <2 x i64>) +declare { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32, <2 x i64>, <2 x i64>) +declare { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64>, i8*) +declare { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64>, i8*) +declare { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64>, i8*) +declare { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64>, i8*) +declare { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>) +declare { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 
x i64>) + +define void @test_loadiwkey(i32 %ctl, <2 x i64> %intkey, <2 x i64> %enkey_lo, <2 x i64> %enkey_hi) { +; X64-LABEL: test_loadiwkey: +; X64: # %bb.0: # %entry +; X64-NEXT: movl %edi, %eax +; X64-NEXT: loadiwkey %xmm2, %xmm1 +; X64-NEXT: retq +; +; X32-LABEL: test_loadiwkey: +; X32: # %bb.0: # %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: loadiwkey %xmm2, %xmm1 +; X32-NEXT: retl +entry: + tail call void @llvm.x86.loadiwkey(i32 %ctl, <2 x i64> %intkey, <2 x i64> %enkey_lo, <2 x i64> %enkey_hi) + ret void +} + +define i32 @test_encodekey128_u32(i32 %htype, <2 x i64> %key, <2 x i64>* nocapture %h0, <2 x i64>* nocapture %h1, <2 x i64>* nocapture %h2, <2 x i64>* nocapture %h3, <2 x i64>* nocapture %h4, <2 x i64>* nocapture %h5) { +; X64-LABEL: test_encodekey128_u32: +; X64: # %bb.0: # %entry +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; X64-NEXT: encodekey128 %edi, %eax +; X64-NEXT: movaps %xmm0, (%rsi) +; X64-NEXT: movaps %xmm1, (%rdx) +; X64-NEXT: movaps %xmm2, (%rcx) +; X64-NEXT: movaps %xmm4, (%r8) +; X64-NEXT: movaps %xmm5, (%r9) +; X64-NEXT: movaps %xmm6, (%r10) +; X64-NEXT: retq +; +; X32-LABEL: test_encodekey128_u32: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 20 +; X32-NEXT: .cfi_offset %esi, -20 +; X32-NEXT: .cfi_offset %edi, -16 +; X32-NEXT: .cfi_offset %ebx, -12 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: encodekey128 %eax, %eax +; X32-NEXT: vmovaps %xmm0, (%ebp) +; X32-NEXT: vmovaps %xmm1, (%ebx) +; X32-NEXT: vmovaps %xmm2, (%edi) +; 
X32-NEXT: vmovaps %xmm4, (%esi) +; X32-NEXT: vmovaps %xmm5, (%edx) +; X32-NEXT: vmovaps %xmm6, (%ecx) +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %ebp +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +entry: + %0 = tail call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 %htype, <2 x i64> %key) + %1 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 1 + store <2 x i64> %1, <2 x i64>* %h0, align 16 + %2 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 2 + store <2 x i64> %2, <2 x i64>* %h1, align 16 + %3 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 3 + store <2 x i64> %3, <2 x i64>* %h2, align 16 + %4 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 4 + store <2 x i64> %4, <2 x i64>* %h3, align 16 + %5 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 5 + store <2 x i64> %5, <2 x i64>* %h4, align 16 + %6 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 6 + store <2 x i64> %6, <2 x i64>* %h5, align 16 + %7 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 0 + ret i32 %7 +} + +define i32 @test_encodekey256_u32(i32 %htype, <2 x i64> %key_lo, <2 x i64> %key_hi, <2 x i64>* nocapture %h0, <2 x i64>* nocapture %h1, <2 x i64>* nocapture %h2, <2 x i64>* nocapture %h3, <2 x i64>* nocapture %h4, <2 x i64>* nocapture %h5, <2 x i64>* nocapture readnone %h6) { +; X64-LABEL: test_encodekey256_u32: +; X64: # %bb.0: # %entry +; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; X64-NEXT: encodekey256 %edi, %eax +; X64-NEXT: movaps %xmm0, (%rsi) +; X64-NEXT: movaps %xmm1, (%rdx) +; 
X64-NEXT: movaps %xmm2, (%rcx) +; X64-NEXT: movaps %xmm3, (%r8) +; X64-NEXT: movaps %xmm4, (%r9) +; X64-NEXT: movaps %xmm5, (%r10) +; X64-NEXT: retq +; +; X32-LABEL: test_encodekey256_u32: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: pushl %ebx +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: pushl %edi +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: pushl %esi +; X32-NEXT: .cfi_def_cfa_offset 20 +; X32-NEXT: .cfi_offset %esi, -20 +; X32-NEXT: .cfi_offset %edi, -16 +; X32-NEXT: .cfi_offset %ebx, -12 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X32-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: encodekey256 %eax, %eax +; X32-NEXT: vmovaps %xmm0, (%ebp) +; X32-NEXT: vmovaps %xmm1, (%ebx) +; X32-NEXT: vmovaps %xmm2, (%edi) +; X32-NEXT: vmovaps %xmm3, (%esi) +; X32-NEXT: vmovaps %xmm4, (%edx) +; X32-NEXT: vmovaps %xmm5, (%ecx) +; X32-NEXT: popl %esi +; X32-NEXT: .cfi_def_cfa_offset 16 +; X32-NEXT: popl %edi +; X32-NEXT: .cfi_def_cfa_offset 12 +; X32-NEXT: popl %ebx +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: popl %ebp +; X32-NEXT: .cfi_def_cfa_offset 4 +; X32-NEXT: retl +entry: + %0 = tail call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 %htype, <2 x i64> %key_lo, <2 x i64> %key_hi) + %1 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 1 + store <2 x i64> %1, <2 x i64>* %h0, align 16 + %2 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 2 + store <2 x i64> %2, <2 x i64>* %h1, align 16 + %3 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 3 + store <2 
x i64> %3, <2 x i64>* %h2, align 16 + %4 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 4 + store <2 x i64> %4, <2 x i64>* %h3, align 16 + %5 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 5 + store <2 x i64> %5, <2 x i64>* %h4, align 16 + %6 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 6 + store <2 x i64> %6, <2 x i64>* %h5, align 16 + %7 = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 0 + ret i32 %7 +} + +define i8 @test_mm_aesenc128kl_u8(<2 x i64> %data, i8* %h) { +; X64-LABEL: test_mm_aesenc128kl_u8: +; X64: # %bb.0: # %entry +; X64-NEXT: aesenc128kl (%rdi), %xmm0 +; X64-NEXT: sete %al +; X64-NEXT: retq +; +; X32-LABEL: test_mm_aesenc128kl_u8: +; X32: # %bb.0: # %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: aesenc128kl (%eax), %xmm0 +; X32-NEXT: sete %al +; X32-NEXT: retl +entry: + %0 = tail call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> %data, i8* %h) + %1 = extractvalue { i8, <2 x i64> } %0, 0 + ret i8 %1 +} + +define i8 @test_mm_aesdec128kl_u8(<2 x i64> %data, i8* %h) { +; X64-LABEL: test_mm_aesdec128kl_u8: +; X64: # %bb.0: # %entry +; X64-NEXT: aesdec128kl (%rdi), %xmm0 +; X64-NEXT: sete %al +; X64-NEXT: retq +; +; X32-LABEL: test_mm_aesdec128kl_u8: +; X32: # %bb.0: # %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: aesdec128kl (%eax), %xmm0 +; X32-NEXT: sete %al +; X32-NEXT: retl +entry: + %0 = tail call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> %data, i8* %h) + %1 = extractvalue { i8, <2 x i64> } %0, 0 + ret i8 %1 +} + +define i8 @test_mm_aesenc256kl_u8(<2 x i64> %data, i8* %h) { +; X64-LABEL: test_mm_aesenc256kl_u8: +; X64: # %bb.0: # %entry +; X64-NEXT: aesenc256kl (%rdi), %xmm0 +; X64-NEXT: sete %al +; X64-NEXT: retq +; +; X32-LABEL: test_mm_aesenc256kl_u8: +; X32: # %bb.0: # %entry +; 
X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: aesenc256kl (%eax), %xmm0 +; X32-NEXT: sete %al +; X32-NEXT: retl +entry: + %0 = tail call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> %data, i8* %h) + %1 = extractvalue { i8, <2 x i64> } %0, 0 + ret i8 %1 +} + +define i8 @test_mm_aesdec256kl_u8(<2 x i64> %data, i8* %h) { +; X64-LABEL: test_mm_aesdec256kl_u8: +; X64: # %bb.0: # %entry +; X64-NEXT: aesdec256kl (%rdi), %xmm0 +; X64-NEXT: sete %al +; X64-NEXT: retq +; +; X32-LABEL: test_mm_aesdec256kl_u8: +; X32: # %bb.0: # %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: aesdec256kl (%eax), %xmm0 +; X32-NEXT: sete %al +; X32-NEXT: retl +entry: + %0 = tail call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> %data, i8* %h) + %1 = extractvalue { i8, <2 x i64> } %0, 0 + ret i8 %1 +} + +define i8 @test_mm_aesencwide128kl_u8(i8* %p, <2 x i64> %v0, <2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3, <2 x i64> %v4, <2 x i64> %v5, <2 x i64> %v6, <2 x i64> %v7) { +; X64-LABEL: test_mm_aesencwide128kl_u8: +; X64: # %bb.0: # %entry +; X64-NEXT: aesencwide128kl (%rdi) +; X64-NEXT: sete %al +; X64-NEXT: retq +; +; X32-LABEL: test_mm_aesencwide128kl_u8: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $16, %esp +; X32-NEXT: vmovaps 24(%ebp), %xmm3 +; X32-NEXT: vmovaps 40(%ebp), %xmm4 +; X32-NEXT: vmovaps 56(%ebp), %xmm5 +; X32-NEXT: vmovaps 72(%ebp), %xmm6 +; X32-NEXT: vmovaps 88(%ebp), %xmm7 +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: aesencwide128kl (%eax) +; X32-NEXT: sete %al +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: .cfi_def_cfa %esp, 4 +; X32-NEXT: retl +entry: + %0 = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* %p, <2 x i64> %v0, <2 x i64> %v1, <2 x i64> %v2, <2 x i64> 
%v3, <2 x i64> %v4, <2 x i64> %v5, <2 x i64> %v6, <2 x i64> %v7) + %1 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 0 + ret i8 %1 +} + +define i8 @test_mm_aesencwide256kl_u8(i8* %p, <2 x i64> %v0, <2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3, <2 x i64> %v4, <2 x i64> %v5, <2 x i64> %v6, <2 x i64> %v7) { +; X64-LABEL: test_mm_aesencwide256kl_u8: +; X64: # %bb.0: # %entry +; X64-NEXT: aesencwide256kl (%rdi) +; X64-NEXT: sete %al +; X64-NEXT: retq +; +; X32-LABEL: test_mm_aesencwide256kl_u8: +; X32: # %bb.0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: .cfi_def_cfa_offset 8 +; X32-NEXT: .cfi_offset %ebp, -8 +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: .cfi_def_cfa_register %ebp +; X32-NEXT: andl $-16, %esp +; X32-NEXT: subl $16, %esp +; X32-NEXT: vmovaps 24(%ebp), %xmm3 +; X32-NEXT: vmovaps 40(%ebp), %xmm4 +; X32-NEXT: vmovaps 56(%ebp), %xmm5 +; X32-NEXT: vmovaps 72(%ebp), %xmm6 +; X32-NEXT: vmovaps 88(%ebp), %xmm7 +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: aesencwide256kl (%eax) +; X32-NEXT: sete %al +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: .cfi_def_cfa %esp, 4 +; X32-NEXT: retl +entry: + %0 = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* %p, <2 x i64> %v0, <2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3, <2 x i64> %v4, <2 x i64> %v5, <2 x i64> %v6, <2 x i64> %v7) + %1 = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %0, 0 + ret i8 %1 +} diff --git a/llvm/test/MC/Disassembler/X86/KEYLOCKER/Keylocker-x86-32-att.txt b/llvm/test/MC/Disassembler/X86/KEYLOCKER/Keylocker-x86-32-att.txt new file mode 100644 index 00000000000000..45f2d1164faacb --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/KEYLOCKER/Keylocker-x86-32-att.txt @@ -0,0 +1,276 @@ +# RUN: llvm-mc --disassemble %s -triple=i686-apple-darwin9 | FileCheck %s +# CHECK: loadiwkey %xmm2, 
%xmm3 +0xf3 0x0f 0x38 0xdc 0xda + +# CHECK: loadiwkey %xmm2, %xmm6 +0xf3 0x0f 0x38 0xdc 0xf2 + +# CHECK: encodekey128 %eax, %ebx +0xf3 0x0f 0x38 0xfa 0xd8 + +# CHECK: encodekey128 %eax, %edx +0xf3 0x0f 0x38 0xfa 0xd0 + +# CHECK: encodekey256 %eax, %ebx +0xf3 0x0f 0x38 0xfb 0xd8 + +# CHECK: encodekey256 %eax, %edx +0xf3 0x0f 0x38 0xfb 0xd0 + +# CHECK: aesenc128kl 126(%edx), %xmm2 +0xf3 0x0f 0x38 0xdc 0x52 0x7e + +# CHECK: aesdec128kl 126(%edx), %xmm2 +0xf3 0x0f 0x38 0xdd 0x52 0x7e + +# CHECK: aesenc256kl 126(%edx), %xmm2 +0xf3 0x0f 0x38 0xde 0x52 0x7e + +# CHECK: aesdec256kl 126(%edx), %xmm2 +0xf3 0x0f 0x38 0xdf 0x52 0x7e + +# CHECK: aesencwide128kl (%ebx) +0xf3 0x0f 0x38 0xd8 0x03 + +# CHECK: aesencwide128kl 126(%edx) +0xf3 0x0f 0x38 0xd8 0x42 0x7e + +# CHECK: aesdecwide128kl (%ebx) +0xf3 0x0f 0x38 0xd8 0x0b + +# CHECK: aesdecwide128kl 126(%edx) +0xf3 0x0f 0x38 0xd8 0x4a 0x7e + +# CHECK: aesencwide256kl (%ebx) +0xf3 0x0f 0x38 0xd8 0x13 + +# CHECK: aesencwide256kl 126(%edx) +0xf3 0x0f 0x38 0xd8 0x52 0x7e + +# CHECK: aesdecwide256kl (%ebx) +0xf3 0x0f 0x38 0xd8 0x1b + +# CHECK: aesdecwide256kl 126(%edx) +0xf3 0x0f 0x38 0xd8 0x5a 0x7e + +# CHECK: aesdec128kl 268435456(%esp,%esi,8), %xmm2 +0xf3,0x0f,0x38,0xdd,0x94,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesdec128kl 291(%edi,%eax,4), %xmm2 +0xf3,0x0f,0x38,0xdd,0x94,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesdec128kl (%eax), %xmm2 +0xf3,0x0f,0x38,0xdd,0x10 + +# CHECK: aesdec128kl -1536(,%ebp,2), %xmm2 +0xf3,0x0f,0x38,0xdd,0x14,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesdec128kl 6096(%ecx), %xmm2 +0xf3,0x0f,0x38,0xdd,0x91,0xd0,0x17,0x00,0x00 + +# CHECK: aesdec128kl -6144(%edx), %xmm2 +0xf3,0x0f,0x38,0xdd,0x92,0x00,0xe8,0xff,0xff + +# CHECK: aesdec256kl 268435456(%esp,%esi,8), %xmm2 +0xf3,0x0f,0x38,0xdf,0x94,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesdec256kl 291(%edi,%eax,4), %xmm2 +0xf3,0x0f,0x38,0xdf,0x94,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesdec256kl (%eax), %xmm2 +0xf3,0x0f,0x38,0xdf,0x10 + +# CHECK: aesdec256kl -2048(,%ebp,2), 
%xmm2 +0xf3,0x0f,0x38,0xdf,0x14,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesdec256kl 8128(%ecx), %xmm2 +0xf3,0x0f,0x38,0xdf,0x91,0xc0,0x1f,0x00,0x00 + +# CHECK: aesdec256kl -8192(%edx), %xmm2 +0xf3,0x0f,0x38,0xdf,0x92,0x00,0xe0,0xff,0xff + +# CHECK: aesenc128kl 268435456(%esp,%esi,8), %xmm2 +0xf3,0x0f,0x38,0xdc,0x94,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesenc128kl 291(%edi,%eax,4), %xmm2 +0xf3,0x0f,0x38,0xdc,0x94,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesenc128kl (%eax), %xmm2 +0xf3,0x0f,0x38,0xdc,0x10 + +# CHECK: aesenc128kl -1536(,%ebp,2), %xmm2 +0xf3,0x0f,0x38,0xdc,0x14,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesenc128kl 6096(%ecx), %xmm2 +0xf3,0x0f,0x38,0xdc,0x91,0xd0,0x17,0x00,0x00 + +# CHECK: aesenc128kl -6144(%edx), %xmm2 +0xf3,0x0f,0x38,0xdc,0x92,0x00,0xe8,0xff,0xff + +# CHECK: aesenc256kl 268435456(%esp,%esi,8), %xmm2 +0xf3,0x0f,0x38,0xde,0x94,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesenc256kl 291(%edi,%eax,4), %xmm2 +0xf3,0x0f,0x38,0xde,0x94,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesenc256kl (%eax), %xmm2 +0xf3,0x0f,0x38,0xde,0x10 + +# CHECK: aesenc256kl -2048(,%ebp,2), %xmm2 +0xf3,0x0f,0x38,0xde,0x14,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesenc256kl 8128(%ecx), %xmm2 +0xf3,0x0f,0x38,0xde,0x91,0xc0,0x1f,0x00,0x00 + +# CHECK: aesenc256kl -8192(%edx), %xmm2 +0xf3,0x0f,0x38,0xde,0x92,0x00,0xe0,0xff,0xff + +# CHECK: loadiwkey %xmm3, %xmm2 +0xf3,0x0f,0x38,0xdc,0xd3 + +# CHECK: aesdec128kl 268435456(%esp,%esi,8), %xmm2 +0xf3,0x0f,0x38,0xdd,0x94,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesdec128kl 291(%edi,%eax,4), %xmm2 +0xf3,0x0f,0x38,0xdd,0x94,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesdec128kl (%eax), %xmm2 +0xf3,0x0f,0x38,0xdd,0x10 + +# CHECK: aesdec128kl -1536(,%ebp,2), %xmm2 +0xf3,0x0f,0x38,0xdd,0x14,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesdec128kl 6096(%ecx), %xmm2 +0xf3,0x0f,0x38,0xdd,0x91,0xd0,0x17,0x00,0x00 + +# CHECK: aesdec128kl -6144(%edx), %xmm2 +0xf3,0x0f,0x38,0xdd,0x92,0x00,0xe8,0xff,0xff + +# CHECK: aesdec256kl 268435456(%esp,%esi,8), %xmm2 
+0xf3,0x0f,0x38,0xdf,0x94,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesdec256kl 291(%edi,%eax,4), %xmm2 +0xf3,0x0f,0x38,0xdf,0x94,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesdec256kl (%eax), %xmm2 +0xf3,0x0f,0x38,0xdf,0x10 + +# CHECK: aesdec256kl -2048(,%ebp,2), %xmm2 +0xf3,0x0f,0x38,0xdf,0x14,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesdec256kl 8128(%ecx), %xmm2 +0xf3,0x0f,0x38,0xdf,0x91,0xc0,0x1f,0x00,0x00 + +# CHECK: aesdec256kl -8192(%edx), %xmm2 +0xf3,0x0f,0x38,0xdf,0x92,0x00,0xe0,0xff,0xff + +# CHECK: aesenc128kl 268435456(%esp,%esi,8), %xmm2 +0xf3,0x0f,0x38,0xdc,0x94,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesenc128kl 291(%edi,%eax,4), %xmm2 +0xf3,0x0f,0x38,0xdc,0x94,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesenc128kl (%eax), %xmm2 +0xf3,0x0f,0x38,0xdc,0x10 + +# CHECK: aesenc128kl -1536(,%ebp,2), %xmm2 +0xf3,0x0f,0x38,0xdc,0x14,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesenc128kl 6096(%ecx), %xmm2 +0xf3,0x0f,0x38,0xdc,0x91,0xd0,0x17,0x00,0x00 + +# CHECK: aesenc128kl -6144(%edx), %xmm2 +0xf3,0x0f,0x38,0xdc,0x92,0x00,0xe8,0xff,0xff + +# CHECK: aesenc256kl 268435456(%esp,%esi,8), %xmm2 +0xf3,0x0f,0x38,0xde,0x94,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesenc256kl 291(%edi,%eax,4), %xmm2 +0xf3,0x0f,0x38,0xde,0x94,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesenc256kl (%eax), %xmm2 +0xf3,0x0f,0x38,0xde,0x10 + +# CHECK: aesenc256kl -2048(,%ebp,2), %xmm2 +0xf3,0x0f,0x38,0xde,0x14,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesenc256kl 8128(%ecx), %xmm2 +0xf3,0x0f,0x38,0xde,0x91,0xc0,0x1f,0x00,0x00 + +# CHECK: aesenc256kl -8192(%edx), %xmm2 +0xf3,0x0f,0x38,0xde,0x92,0x00,0xe0,0xff,0xff + +# CHECK: loadiwkey %xmm3, %xmm2 +0xf3,0x0f,0x38,0xdc,0xd3 + +# CHECK: aesdecwide128kl 268435456(%esp,%esi,8) +0xf3,0x0f,0x38,0xd8,0x8c,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesdecwide128kl 291(%edi,%eax,4) +0xf3,0x0f,0x38,0xd8,0x8c,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesdecwide128kl (%eax) +0xf3,0x0f,0x38,0xd8,0x08 + +# CHECK: aesdecwide128kl -1536(,%ebp,2) +0xf3,0x0f,0x38,0xd8,0x0c,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: 
aesdecwide128kl 6096(%ecx) +0xf3,0x0f,0x38,0xd8,0x89,0xd0,0x17,0x00,0x00 + +# CHECK: aesdecwide128kl -6144(%edx) +0xf3,0x0f,0x38,0xd8,0x8a,0x00,0xe8,0xff,0xff + +# CHECK: aesdecwide256kl 268435456(%esp,%esi,8) +0xf3,0x0f,0x38,0xd8,0x9c,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesdecwide256kl 291(%edi,%eax,4) +0xf3,0x0f,0x38,0xd8,0x9c,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesdecwide256kl (%eax) +0xf3,0x0f,0x38,0xd8,0x18 + +# CHECK: aesdecwide256kl -2048(,%ebp,2) +0xf3,0x0f,0x38,0xd8,0x1c,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesdecwide256kl 8128(%ecx) +0xf3,0x0f,0x38,0xd8,0x99,0xc0,0x1f,0x00,0x00 + +# CHECK: aesdecwide256kl -8192(%edx) +0xf3,0x0f,0x38,0xd8,0x9a,0x00,0xe0,0xff,0xff + +# CHECK: aesencwide128kl 268435456(%esp,%esi,8) +0xf3,0x0f,0x38,0xd8,0x84,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesencwide128kl 291(%edi,%eax,4) +0xf3,0x0f,0x38,0xd8,0x84,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesencwide128kl (%eax) +0xf3,0x0f,0x38,0xd8,0x00 + +# CHECK: aesencwide128kl -1536(,%ebp,2) +0xf3,0x0f,0x38,0xd8,0x04,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesencwide128kl 6096(%ecx) +0xf3,0x0f,0x38,0xd8,0x81,0xd0,0x17,0x00,0x00 + +# CHECK: aesencwide128kl -6144(%edx) +0xf3,0x0f,0x38,0xd8,0x82,0x00,0xe8,0xff,0xff + +# CHECK: aesencwide256kl 268435456(%esp,%esi,8) +0xf3,0x0f,0x38,0xd8,0x94,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesencwide256kl 291(%edi,%eax,4) +0xf3,0x0f,0x38,0xd8,0x94,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesencwide256kl (%eax) +0xf3,0x0f,0x38,0xd8,0x10 + +# CHECK: aesencwide256kl -2048(,%ebp,2) +0xf3,0x0f,0x38,0xd8,0x14,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesencwide256kl 8128(%ecx) +0xf3,0x0f,0x38,0xd8,0x91,0xc0,0x1f,0x00,0x00 + +# CHECK: aesencwide256kl -8192(%edx) +0xf3,0x0f,0x38,0xd8,0x92,0x00,0xe0,0xff,0xff diff --git a/llvm/test/MC/Disassembler/X86/KEYLOCKER/Keylocker-x86-32-intel.txt b/llvm/test/MC/Disassembler/X86/KEYLOCKER/Keylocker-x86-32-intel.txt new file mode 100644 index 00000000000000..983abeb7806010 --- /dev/null +++ 
b/llvm/test/MC/Disassembler/X86/KEYLOCKER/Keylocker-x86-32-intel.txt @@ -0,0 +1,223 @@ +# RUN: llvm-mc --disassemble %s -triple=i386 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s + +# CHECK: aesdec128kl xmm2, [esp + 8*esi + 268435456] +0xf3,0x0f,0x38,0xdd,0x94,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesdec128kl xmm2, [edi + 4*eax + 291] +0xf3,0x0f,0x38,0xdd,0x94,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesdec128kl xmm2, [eax] +0xf3,0x0f,0x38,0xdd,0x10 + +# CHECK: aesdec128kl xmm2, [2*ebp - 1536] +0xf3,0x0f,0x38,0xdd,0x14,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesdec128kl xmm2, [ecx + 6096] +0xf3,0x0f,0x38,0xdd,0x91,0xd0,0x17,0x00,0x00 + +# CHECK: aesdec128kl xmm2, [edx - 6144] +0xf3,0x0f,0x38,0xdd,0x92,0x00,0xe8,0xff,0xff + +# CHECK: aesdec256kl xmm2, [esp + 8*esi + 268435456] +0xf3,0x0f,0x38,0xdf,0x94,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesdec256kl xmm2, [edi + 4*eax + 291] +0xf3,0x0f,0x38,0xdf,0x94,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesdec256kl xmm2, [eax] +0xf3,0x0f,0x38,0xdf,0x10 + +# CHECK: aesdec256kl xmm2, [2*ebp - 2048] +0xf3,0x0f,0x38,0xdf,0x14,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesdec256kl xmm2, [ecx + 8128] +0xf3,0x0f,0x38,0xdf,0x91,0xc0,0x1f,0x00,0x00 + +# CHECK: aesdec256kl xmm2, [edx - 8192] +0xf3,0x0f,0x38,0xdf,0x92,0x00,0xe0,0xff,0xff + +# CHECK: aesenc128kl xmm2, [esp + 8*esi + 268435456] +0xf3,0x0f,0x38,0xdc,0x94,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesenc128kl xmm2, [edi + 4*eax + 291] +0xf3,0x0f,0x38,0xdc,0x94,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesenc128kl xmm2, [eax] +0xf3,0x0f,0x38,0xdc,0x10 + +# CHECK: aesenc128kl xmm2, [2*ebp - 1536] +0xf3,0x0f,0x38,0xdc,0x14,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesenc128kl xmm2, [ecx + 6096] +0xf3,0x0f,0x38,0xdc,0x91,0xd0,0x17,0x00,0x00 + +# CHECK: aesenc128kl xmm2, [edx - 6144] +0xf3,0x0f,0x38,0xdc,0x92,0x00,0xe8,0xff,0xff + +# CHECK: aesenc256kl xmm2, [esp + 8*esi + 268435456] +0xf3,0x0f,0x38,0xde,0x94,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesenc256kl xmm2, [edi + 4*eax + 291] 
+0xf3,0x0f,0x38,0xde,0x94,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesenc256kl xmm2, [eax] +0xf3,0x0f,0x38,0xde,0x10 + +# CHECK: aesenc256kl xmm2, [2*ebp - 2048] +0xf3,0x0f,0x38,0xde,0x14,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesenc256kl xmm2, [ecx + 8128] +0xf3,0x0f,0x38,0xde,0x91,0xc0,0x1f,0x00,0x00 + +# CHECK: aesenc256kl xmm2, [edx - 8192] +0xf3,0x0f,0x38,0xde,0x92,0x00,0xe0,0xff,0xff + +# CHECK: loadiwkey xmm2, xmm3 +0xf3,0x0f,0x38,0xdc,0xd3 + +# CHECK: aesdec128kl xmm2, [esp + 8*esi + 268435456] +0xf3,0x0f,0x38,0xdd,0x94,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesdec128kl xmm2, [edi + 4*eax + 291] +0xf3,0x0f,0x38,0xdd,0x94,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesdec128kl xmm2, [eax] +0xf3,0x0f,0x38,0xdd,0x10 + +# CHECK: aesdec128kl xmm2, [2*ebp - 1536] +0xf3,0x0f,0x38,0xdd,0x14,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesdec128kl xmm2, [ecx + 6096] +0xf3,0x0f,0x38,0xdd,0x91,0xd0,0x17,0x00,0x00 + +# CHECK: aesdec128kl xmm2, [edx - 6144] +0xf3,0x0f,0x38,0xdd,0x92,0x00,0xe8,0xff,0xff + +# CHECK: aesdec256kl xmm2, [esp + 8*esi + 268435456] +0xf3,0x0f,0x38,0xdf,0x94,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesdec256kl xmm2, [edi + 4*eax + 291] +0xf3,0x0f,0x38,0xdf,0x94,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesdec256kl xmm2, [eax] +0xf3,0x0f,0x38,0xdf,0x10 + +# CHECK: aesdec256kl xmm2, [2*ebp - 2048] +0xf3,0x0f,0x38,0xdf,0x14,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesdec256kl xmm2, [ecx + 8128] +0xf3,0x0f,0x38,0xdf,0x91,0xc0,0x1f,0x00,0x00 + +# CHECK: aesdec256kl xmm2, [edx - 8192] +0xf3,0x0f,0x38,0xdf,0x92,0x00,0xe0,0xff,0xff + +# CHECK: aesenc128kl xmm2, [esp + 8*esi + 268435456] +0xf3,0x0f,0x38,0xdc,0x94,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesenc128kl xmm2, [edi + 4*eax + 291] +0xf3,0x0f,0x38,0xdc,0x94,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesenc128kl xmm2, [eax] +0xf3,0x0f,0x38,0xdc,0x10 + +# CHECK: aesenc128kl xmm2, [2*ebp - 1536] +0xf3,0x0f,0x38,0xdc,0x14,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesenc128kl xmm2, [ecx + 6096] +0xf3,0x0f,0x38,0xdc,0x91,0xd0,0x17,0x00,0x00 + +# 
CHECK: aesenc128kl xmm2, [edx - 6144] +0xf3,0x0f,0x38,0xdc,0x92,0x00,0xe8,0xff,0xff + +# CHECK: aesenc256kl xmm2, [esp + 8*esi + 268435456] +0xf3,0x0f,0x38,0xde,0x94,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesenc256kl xmm2, [edi + 4*eax + 291] +0xf3,0x0f,0x38,0xde,0x94,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesenc256kl xmm2, [eax] +0xf3,0x0f,0x38,0xde,0x10 + +# CHECK: aesenc256kl xmm2, [2*ebp - 2048] +0xf3,0x0f,0x38,0xde,0x14,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesenc256kl xmm2, [ecx + 8128] +0xf3,0x0f,0x38,0xde,0x91,0xc0,0x1f,0x00,0x00 + +# CHECK: aesenc256kl xmm2, [edx - 8192] +0xf3,0x0f,0x38,0xde,0x92,0x00,0xe0,0xff,0xff + +# CHECK: loadiwkey xmm2, xmm3 +0xf3,0x0f,0x38,0xdc,0xd3 + +# CHECK: aesdecwide128kl [esp + 8*esi + 268435456] +0xf3,0x0f,0x38,0xd8,0x8c,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesdecwide128kl [edi + 4*eax + 291] +0xf3,0x0f,0x38,0xd8,0x8c,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesdecwide128kl [eax] +0xf3,0x0f,0x38,0xd8,0x08 + +# CHECK: aesdecwide128kl [2*ebp - 1536] +0xf3,0x0f,0x38,0xd8,0x0c,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesdecwide128kl [ecx + 6096] +0xf3,0x0f,0x38,0xd8,0x89,0xd0,0x17,0x00,0x00 + +# CHECK: aesdecwide128kl [edx - 6144] +0xf3,0x0f,0x38,0xd8,0x8a,0x00,0xe8,0xff,0xff + +# CHECK: aesdecwide256kl [esp + 8*esi + 268435456] +0xf3,0x0f,0x38,0xd8,0x9c,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesdecwide256kl [edi + 4*eax + 291] +0xf3,0x0f,0x38,0xd8,0x9c,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesdecwide256kl [eax] +0xf3,0x0f,0x38,0xd8,0x18 + +# CHECK: aesdecwide256kl [2*ebp - 2048] +0xf3,0x0f,0x38,0xd8,0x1c,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesdecwide256kl [ecx + 8128] +0xf3,0x0f,0x38,0xd8,0x99,0xc0,0x1f,0x00,0x00 + +# CHECK: aesdecwide256kl [edx - 8192] +0xf3,0x0f,0x38,0xd8,0x9a,0x00,0xe0,0xff,0xff + +# CHECK: aesencwide128kl [esp + 8*esi + 268435456] +0xf3,0x0f,0x38,0xd8,0x84,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesencwide128kl [edi + 4*eax + 291] +0xf3,0x0f,0x38,0xd8,0x84,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesencwide128kl [eax] 
+0xf3,0x0f,0x38,0xd8,0x00 + +# CHECK: aesencwide128kl [2*ebp - 1536] +0xf3,0x0f,0x38,0xd8,0x04,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesencwide128kl [ecx + 6096] +0xf3,0x0f,0x38,0xd8,0x81,0xd0,0x17,0x00,0x00 + +# CHECK: aesencwide128kl [edx - 6144] +0xf3,0x0f,0x38,0xd8,0x82,0x00,0xe8,0xff,0xff + +# CHECK: aesencwide256kl [esp + 8*esi + 268435456] +0xf3,0x0f,0x38,0xd8,0x94,0xf4,0x00,0x00,0x00,0x10 + +# CHECK: aesencwide256kl [edi + 4*eax + 291] +0xf3,0x0f,0x38,0xd8,0x94,0x87,0x23,0x01,0x00,0x00 + +# CHECK: aesencwide256kl [eax] +0xf3,0x0f,0x38,0xd8,0x10 + +# CHECK: aesencwide256kl [2*ebp - 2048] +0xf3,0x0f,0x38,0xd8,0x14,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesencwide256kl [ecx + 8128] +0xf3,0x0f,0x38,0xd8,0x91,0xc0,0x1f,0x00,0x00 + +# CHECK: aesencwide256kl [edx - 8192] +0xf3,0x0f,0x38,0xd8,0x92,0x00,0xe0,0xff,0xff diff --git a/llvm/test/MC/Disassembler/X86/KEYLOCKER/Keylocker-x86-64-att.txt b/llvm/test/MC/Disassembler/X86/KEYLOCKER/Keylocker-x86-64-att.txt new file mode 100644 index 00000000000000..973677d92aa327 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/KEYLOCKER/Keylocker-x86-64-att.txt @@ -0,0 +1,277 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s +# CHECK: loadiwkey %xmm2, %xmm3 +0xf3 0x0f 0x38 0xdc 0xda + +# CHECK: loadiwkey %xmm2, %xmm6 +0xf3 0x0f 0x38 0xdc 0xf2 + +# CHECK: encodekey128 %eax, %ebx +0xf3 0x0f 0x38 0xfa 0xd8 + +# CHECK: encodekey128 %eax, %edx +0xf3 0x0f 0x38 0xfa 0xd0 + +# CHECK: encodekey256 %eax, %ebx +0xf3 0x0f 0x38 0xfb 0xd8 + +# CHECK: encodekey256 %eax, %edx +0xf3 0x0f 0x38 0xfb 0xd0 + +# CHECK: aesenc128kl 126(%rdx), %xmm2 +0xf3 0x0f 0x38 0xdc 0x52 0x7e + +# CHECK: aesdec128kl 126(%rdx), %xmm2 +0xf3 0x0f 0x38 0xdd 0x52 0x7e + +# CHECK: aesenc256kl 126(%rdx), %xmm2 +0xf3 0x0f 0x38 0xde 0x52 0x7e + +# CHECK: aesdec256kl 126(%rdx), %xmm2 +0xf3 0x0f 0x38 0xdf 0x52 0x7e + +# CHECK: aesencwide128kl (%rbx) +0xf3 0x0f 0x38 0xd8 0x03 + +# CHECK: aesencwide128kl 126(%rdx) +0xf3 0x0f 0x38 0xd8 0x42 0x7e + +# CHECK: 
aesdecwide128kl (%rbx) +0xf3 0x0f 0x38 0xd8 0x0b + +# CHECK: aesdecwide128kl 126(%rdx) +0xf3 0x0f 0x38 0xd8 0x4a 0x7e + +# CHECK: aesencwide256kl (%rbx) +0xf3 0x0f 0x38 0xd8 0x13 + +# CHECK: aesencwide256kl 126(%rdx) +0xf3 0x0f 0x38 0xd8 0x52 0x7e + +# CHECK: aesdecwide256kl (%rbx) +0xf3 0x0f 0x38 0xd8 0x1b + +# CHECK: aesdecwide256kl 126(%rdx) +0xf3 0x0f 0x38 0xd8 0x5a 0x7e + +# CHECK: aesdec128kl 268435456(%rbp,%r14,8), %xmm6 +0xf3,0x42,0x0f,0x38,0xdd,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesdec128kl 291(%r8,%rax,4), %xmm6 +0xf3,0x41,0x0f,0x38,0xdd,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesdec128kl (%rip), %xmm6 +0xf3,0x0f,0x38,0xdd,0x35,0x00,0x00,0x00,0x00 + +# CHECK: aesdec128kl -1536(,%rbp,2), %xmm6 +0xf3,0x0f,0x38,0xdd,0x34,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesdec128kl 6096(%rcx), %xmm6 +0xf3,0x0f,0x38,0xdd,0xb1,0xd0,0x17,0x00,0x00 + +# CHECK: aesdec128kl -6144(%rdx), %xmm6 +0xf3,0x0f,0x38,0xdd,0xb2,0x00,0xe8,0xff,0xff + +# CHECK: aesdec256kl 268435456(%rbp,%r14,8), %xmm6 +0xf3,0x42,0x0f,0x38,0xdf,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesdec256kl 291(%r8,%rax,4), %xmm6 +0xf3,0x41,0x0f,0x38,0xdf,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesdec256kl (%rip), %xmm6 +0xf3,0x0f,0x38,0xdf,0x35,0x00,0x00,0x00,0x00 + +# CHECK: aesdec256kl -2048(,%rbp,2), %xmm6 +0xf3,0x0f,0x38,0xdf,0x34,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesdec256kl 8128(%rcx), %xmm6 +0xf3,0x0f,0x38,0xdf,0xb1,0xc0,0x1f,0x00,0x00 + +# CHECK: aesdec256kl -8192(%rdx), %xmm6 +0xf3,0x0f,0x38,0xdf,0xb2,0x00,0xe0,0xff,0xff + +# CHECK: aesenc128kl 268435456(%rbp,%r14,8), %xmm6 +0xf3,0x42,0x0f,0x38,0xdc,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesenc128kl 291(%r8,%rax,4), %xmm6 +0xf3,0x41,0x0f,0x38,0xdc,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesenc128kl (%rip), %xmm6 +0xf3,0x0f,0x38,0xdc,0x35,0x00,0x00,0x00,0x00 + +# CHECK: aesenc128kl -1536(,%rbp,2), %xmm6 +0xf3,0x0f,0x38,0xdc,0x34,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesenc128kl 6096(%rcx), %xmm6 
+0xf3,0x0f,0x38,0xdc,0xb1,0xd0,0x17,0x00,0x00 + +# CHECK: aesenc128kl -6144(%rdx), %xmm6 +0xf3,0x0f,0x38,0xdc,0xb2,0x00,0xe8,0xff,0xff + +# CHECK: aesenc256kl 268435456(%rbp,%r14,8), %xmm6 +0xf3,0x42,0x0f,0x38,0xde,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesenc256kl 291(%r8,%rax,4), %xmm6 +0xf3,0x41,0x0f,0x38,0xde,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesenc256kl (%rip), %xmm6 +0xf3,0x0f,0x38,0xde,0x35,0x00,0x00,0x00,0x00 + +# CHECK: aesenc256kl -2048(,%rbp,2), %xmm6 +0xf3,0x0f,0x38,0xde,0x34,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesenc256kl 8128(%rcx), %xmm6 +0xf3,0x0f,0x38,0xde,0xb1,0xc0,0x1f,0x00,0x00 + +# CHECK: aesenc256kl -8192(%rdx), %xmm6 +0xf3,0x0f,0x38,0xde,0xb2,0x00,0xe0,0xff,0xff + +# CHECK: loadiwkey %xmm7, %xmm6 +0xf3,0x0f,0x38,0xdc,0xf7 + +# CHECK: aesdec128kl 268435456(%rbp,%r14,8), %xmm6 +0xf3,0x42,0x0f,0x38,0xdd,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesdec128kl 291(%r8,%rax,4), %xmm6 +0xf3,0x41,0x0f,0x38,0xdd,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesdec128kl (%rip), %xmm6 +0xf3,0x0f,0x38,0xdd,0x35,0x00,0x00,0x00,0x00 + +# CHECK: aesdec128kl -1536(,%rbp,2), %xmm6 +0xf3,0x0f,0x38,0xdd,0x34,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesdec128kl 6096(%rcx), %xmm6 +0xf3,0x0f,0x38,0xdd,0xb1,0xd0,0x17,0x00,0x00 + +# CHECK: aesdec128kl -6144(%rdx), %xmm6 +0xf3,0x0f,0x38,0xdd,0xb2,0x00,0xe8,0xff,0xff + +# CHECK: aesdec256kl 268435456(%rbp,%r14,8), %xmm6 +0xf3,0x42,0x0f,0x38,0xdf,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesdec256kl 291(%r8,%rax,4), %xmm6 +0xf3,0x41,0x0f,0x38,0xdf,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesdec256kl (%rip), %xmm6 +0xf3,0x0f,0x38,0xdf,0x35,0x00,0x00,0x00,0x00 + +# CHECK: aesdec256kl -2048(,%rbp,2), %xmm6 +0xf3,0x0f,0x38,0xdf,0x34,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesdec256kl 8128(%rcx), %xmm6 +0xf3,0x0f,0x38,0xdf,0xb1,0xc0,0x1f,0x00,0x00 + +# CHECK: aesdec256kl -8192(%rdx), %xmm6 +0xf3,0x0f,0x38,0xdf,0xb2,0x00,0xe0,0xff,0xff + +# CHECK: aesenc128kl 268435456(%rbp,%r14,8), %xmm6 
+0xf3,0x42,0x0f,0x38,0xdc,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesenc128kl 291(%r8,%rax,4), %xmm6 +0xf3,0x41,0x0f,0x38,0xdc,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesenc128kl (%rip), %xmm6 +0xf3,0x0f,0x38,0xdc,0x35,0x00,0x00,0x00,0x00 + +# CHECK: aesenc128kl -1536(,%rbp,2), %xmm6 +0xf3,0x0f,0x38,0xdc,0x34,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesenc128kl 6096(%rcx), %xmm6 +0xf3,0x0f,0x38,0xdc,0xb1,0xd0,0x17,0x00,0x00 + +# CHECK: aesenc128kl -6144(%rdx), %xmm6 +0xf3,0x0f,0x38,0xdc,0xb2,0x00,0xe8,0xff,0xff + +# CHECK: aesenc256kl 268435456(%rbp,%r14,8), %xmm6 +0xf3,0x42,0x0f,0x38,0xde,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesenc256kl 291(%r8,%rax,4), %xmm6 +0xf3,0x41,0x0f,0x38,0xde,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesenc256kl (%rip), %xmm6 +0xf3,0x0f,0x38,0xde,0x35,0x00,0x00,0x00,0x00 + +# CHECK: aesenc256kl -2048(,%rbp,2), %xmm6 +0xf3,0x0f,0x38,0xde,0x34,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesenc256kl 8128(%rcx), %xmm6 +0xf3,0x0f,0x38,0xde,0xb1,0xc0,0x1f,0x00,0x00 + +# CHECK: aesenc256kl -8192(%rdx), %xmm6 +0xf3,0x0f,0x38,0xde,0xb2,0x00,0xe0,0xff,0xff + +# CHECK: loadiwkey %xmm7, %xmm6 +0xf3,0x0f,0x38,0xdc,0xf7 + +# CHECK: aesdecwide128kl 268435456(%rbp,%r14,8) +0xf3,0x42,0x0f,0x38,0xd8,0x8c,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesdecwide128kl 291(%r8,%rax,4) +0xf3,0x41,0x0f,0x38,0xd8,0x8c,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesdecwide128kl (%rip) +0xf3,0x0f,0x38,0xd8,0x0d,0x00,0x00,0x00,0x00 + +# CHECK: aesdecwide128kl -1536(,%rbp,2) +0xf3,0x0f,0x38,0xd8,0x0c,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesdecwide128kl 6096(%rcx) +0xf3,0x0f,0x38,0xd8,0x89,0xd0,0x17,0x00,0x00 + +# CHECK: aesdecwide128kl -6144(%rdx) +0xf3,0x0f,0x38,0xd8,0x8a,0x00,0xe8,0xff,0xff + +# CHECK: aesdecwide256kl 268435456(%rbp,%r14,8) +0xf3,0x42,0x0f,0x38,0xd8,0x9c,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesdecwide256kl 291(%r8,%rax,4) +0xf3,0x41,0x0f,0x38,0xd8,0x9c,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesdecwide256kl (%rip) +0xf3,0x0f,0x38,0xd8,0x1d,0x00,0x00,0x00,0x00 + +# 
CHECK: aesdecwide256kl -2048(,%rbp,2) +0xf3,0x0f,0x38,0xd8,0x1c,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesdecwide256kl 8128(%rcx) +0xf3,0x0f,0x38,0xd8,0x99,0xc0,0x1f,0x00,0x00 + +# CHECK: aesdecwide256kl -8192(%rdx) +0xf3,0x0f,0x38,0xd8,0x9a,0x00,0xe0,0xff,0xff + +# CHECK: aesencwide128kl 268435456(%rbp,%r14,8) +0xf3,0x42,0x0f,0x38,0xd8,0x84,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesencwide128kl 291(%r8,%rax,4) +0xf3,0x41,0x0f,0x38,0xd8,0x84,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesencwide128kl (%rip) +0xf3,0x0f,0x38,0xd8,0x05,0x00,0x00,0x00,0x00 + +# CHECK: aesencwide128kl -1536(,%rbp,2) +0xf3,0x0f,0x38,0xd8,0x04,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesencwide128kl 6096(%rcx) +0xf3,0x0f,0x38,0xd8,0x81,0xd0,0x17,0x00,0x00 + +# CHECK: aesencwide128kl -6144(%rdx) +0xf3,0x0f,0x38,0xd8,0x82,0x00,0xe8,0xff,0xff + +# CHECK: aesencwide256kl 268435456(%rbp,%r14,8) +0xf3,0x42,0x0f,0x38,0xd8,0x94,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesencwide256kl 291(%r8,%rax,4) +0xf3,0x41,0x0f,0x38,0xd8,0x94,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesencwide256kl (%rip) +0xf3,0x0f,0x38,0xd8,0x15,0x00,0x00,0x00,0x00 + +# CHECK: aesencwide256kl -2048(,%rbp,2) +0xf3,0x0f,0x38,0xd8,0x14,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesencwide256kl 8128(%rcx) +0xf3,0x0f,0x38,0xd8,0x91,0xc0,0x1f,0x00,0x00 + +# CHECK: aesencwide256kl -8192(%rdx) +0xf3,0x0f,0x38,0xd8,0x92,0x00,0xe0,0xff,0xff + diff --git a/llvm/test/MC/Disassembler/X86/KEYLOCKER/Keylocker-x86-64-intel.txt b/llvm/test/MC/Disassembler/X86/KEYLOCKER/Keylocker-x86-64-intel.txt new file mode 100644 index 00000000000000..262c6185f85b0a --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/KEYLOCKER/Keylocker-x86-64-intel.txt @@ -0,0 +1,223 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s + +# CHECK: aesdec128kl xmm6, [rbp + 8*r14 + 268435456] +0xf3,0x42,0x0f,0x38,0xdd,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesdec128kl xmm6, [r8 + 4*rax + 291] 
+0xf3,0x41,0x0f,0x38,0xdd,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesdec128kl xmm6, [rip] +0xf3,0x0f,0x38,0xdd,0x35,0x00,0x00,0x00,0x00 + +# CHECK: aesdec128kl xmm6, [2*rbp - 1536] +0xf3,0x0f,0x38,0xdd,0x34,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesdec128kl xmm6, [rcx + 6096] +0xf3,0x0f,0x38,0xdd,0xb1,0xd0,0x17,0x00,0x00 + +# CHECK: aesdec128kl xmm6, [rdx - 6144] +0xf3,0x0f,0x38,0xdd,0xb2,0x00,0xe8,0xff,0xff + +# CHECK: aesdec256kl xmm6, [rbp + 8*r14 + 268435456] +0xf3,0x42,0x0f,0x38,0xdf,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesdec256kl xmm6, [r8 + 4*rax + 291] +0xf3,0x41,0x0f,0x38,0xdf,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesdec256kl xmm6, [rip] +0xf3,0x0f,0x38,0xdf,0x35,0x00,0x00,0x00,0x00 + +# CHECK: aesdec256kl xmm6, [2*rbp - 2048] +0xf3,0x0f,0x38,0xdf,0x34,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesdec256kl xmm6, [rcx + 8128] +0xf3,0x0f,0x38,0xdf,0xb1,0xc0,0x1f,0x00,0x00 + +# CHECK: aesdec256kl xmm6, [rdx - 8192] +0xf3,0x0f,0x38,0xdf,0xb2,0x00,0xe0,0xff,0xff + +# CHECK: aesenc128kl xmm6, [rbp + 8*r14 + 268435456] +0xf3,0x42,0x0f,0x38,0xdc,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesenc128kl xmm6, [r8 + 4*rax + 291] +0xf3,0x41,0x0f,0x38,0xdc,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesenc128kl xmm6, [rip] +0xf3,0x0f,0x38,0xdc,0x35,0x00,0x00,0x00,0x00 + +# CHECK: aesenc128kl xmm6, [2*rbp - 1536] +0xf3,0x0f,0x38,0xdc,0x34,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesenc128kl xmm6, [rcx + 6096] +0xf3,0x0f,0x38,0xdc,0xb1,0xd0,0x17,0x00,0x00 + +# CHECK: aesenc128kl xmm6, [rdx - 6144] +0xf3,0x0f,0x38,0xdc,0xb2,0x00,0xe8,0xff,0xff + +# CHECK: aesenc256kl xmm6, [rbp + 8*r14 + 268435456] +0xf3,0x42,0x0f,0x38,0xde,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesenc256kl xmm6, [r8 + 4*rax + 291] +0xf3,0x41,0x0f,0x38,0xde,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesenc256kl xmm6, [rip] +0xf3,0x0f,0x38,0xde,0x35,0x00,0x00,0x00,0x00 + +# CHECK: aesenc256kl xmm6, [2*rbp - 2048] +0xf3,0x0f,0x38,0xde,0x34,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesenc256kl xmm6, [rcx + 8128] 
+0xf3,0x0f,0x38,0xde,0xb1,0xc0,0x1f,0x00,0x00 + +# CHECK: aesenc256kl xmm6, [rdx - 8192] +0xf3,0x0f,0x38,0xde,0xb2,0x00,0xe0,0xff,0xff + +# CHECK: loadiwkey xmm6, xmm7 +0xf3,0x0f,0x38,0xdc,0xf7 + +# CHECK: aesdec128kl xmm6, [rbp + 8*r14 + 268435456] +0xf3,0x42,0x0f,0x38,0xdd,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesdec128kl xmm6, [r8 + 4*rax + 291] +0xf3,0x41,0x0f,0x38,0xdd,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesdec128kl xmm6, [rip] +0xf3,0x0f,0x38,0xdd,0x35,0x00,0x00,0x00,0x00 + +# CHECK: aesdec128kl xmm6, [2*rbp - 1536] +0xf3,0x0f,0x38,0xdd,0x34,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesdec128kl xmm6, [rcx + 6096] +0xf3,0x0f,0x38,0xdd,0xb1,0xd0,0x17,0x00,0x00 + +# CHECK: aesdec128kl xmm6, [rdx - 6144] +0xf3,0x0f,0x38,0xdd,0xb2,0x00,0xe8,0xff,0xff + +# CHECK: aesdec256kl xmm6, [rbp + 8*r14 + 268435456] +0xf3,0x42,0x0f,0x38,0xdf,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesdec256kl xmm6, [r8 + 4*rax + 291] +0xf3,0x41,0x0f,0x38,0xdf,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesdec256kl xmm6, [rip] +0xf3,0x0f,0x38,0xdf,0x35,0x00,0x00,0x00,0x00 + +# CHECK: aesdec256kl xmm6, [2*rbp - 2048] +0xf3,0x0f,0x38,0xdf,0x34,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesdec256kl xmm6, [rcx + 8128] +0xf3,0x0f,0x38,0xdf,0xb1,0xc0,0x1f,0x00,0x00 + +# CHECK: aesdec256kl xmm6, [rdx - 8192] +0xf3,0x0f,0x38,0xdf,0xb2,0x00,0xe0,0xff,0xff + +# CHECK: aesenc128kl xmm6, [rbp + 8*r14 + 268435456] +0xf3,0x42,0x0f,0x38,0xdc,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesenc128kl xmm6, [r8 + 4*rax + 291] +0xf3,0x41,0x0f,0x38,0xdc,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesenc128kl xmm6, [rip] +0xf3,0x0f,0x38,0xdc,0x35,0x00,0x00,0x00,0x00 + +# CHECK: aesenc128kl xmm6, [2*rbp - 1536] +0xf3,0x0f,0x38,0xdc,0x34,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesenc128kl xmm6, [rcx + 6096] +0xf3,0x0f,0x38,0xdc,0xb1,0xd0,0x17,0x00,0x00 + +# CHECK: aesenc128kl xmm6, [rdx - 6144] +0xf3,0x0f,0x38,0xdc,0xb2,0x00,0xe8,0xff,0xff + +# CHECK: aesenc256kl xmm6, [rbp + 8*r14 + 268435456] 
+0xf3,0x42,0x0f,0x38,0xde,0xb4,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesenc256kl xmm6, [r8 + 4*rax + 291] +0xf3,0x41,0x0f,0x38,0xde,0xb4,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesenc256kl xmm6, [rip] +0xf3,0x0f,0x38,0xde,0x35,0x00,0x00,0x00,0x00 + +# CHECK: aesenc256kl xmm6, [2*rbp - 2048] +0xf3,0x0f,0x38,0xde,0x34,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesenc256kl xmm6, [rcx + 8128] +0xf3,0x0f,0x38,0xde,0xb1,0xc0,0x1f,0x00,0x00 + +# CHECK: aesenc256kl xmm6, [rdx - 8192] +0xf3,0x0f,0x38,0xde,0xb2,0x00,0xe0,0xff,0xff + +# CHECK: loadiwkey xmm6, xmm7 +0xf3,0x0f,0x38,0xdc,0xf7 + +# CHECK: aesdecwide128kl [rbp + 8*r14 + 268435456] +0xf3,0x42,0x0f,0x38,0xd8,0x8c,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesdecwide128kl [r8 + 4*rax + 291] +0xf3,0x41,0x0f,0x38,0xd8,0x8c,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesdecwide128kl [rip] +0xf3,0x0f,0x38,0xd8,0x0d,0x00,0x00,0x00,0x00 + +# CHECK: aesdecwide128kl [2*rbp - 1536] +0xf3,0x0f,0x38,0xd8,0x0c,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesdecwide128kl [rcx + 6096] +0xf3,0x0f,0x38,0xd8,0x89,0xd0,0x17,0x00,0x00 + +# CHECK: aesdecwide128kl [rdx - 6144] +0xf3,0x0f,0x38,0xd8,0x8a,0x00,0xe8,0xff,0xff + +# CHECK: aesdecwide256kl [rbp + 8*r14 + 268435456] +0xf3,0x42,0x0f,0x38,0xd8,0x9c,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesdecwide256kl [r8 + 4*rax + 291] +0xf3,0x41,0x0f,0x38,0xd8,0x9c,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesdecwide256kl [rip] +0xf3,0x0f,0x38,0xd8,0x1d,0x00,0x00,0x00,0x00 + +# CHECK: aesdecwide256kl [2*rbp - 2048] +0xf3,0x0f,0x38,0xd8,0x1c,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesdecwide256kl [rcx + 8128] +0xf3,0x0f,0x38,0xd8,0x99,0xc0,0x1f,0x00,0x00 + +# CHECK: aesdecwide256kl [rdx - 8192] +0xf3,0x0f,0x38,0xd8,0x9a,0x00,0xe0,0xff,0xff + +# CHECK: aesencwide128kl [rbp + 8*r14 + 268435456] +0xf3,0x42,0x0f,0x38,0xd8,0x84,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesencwide128kl [r8 + 4*rax + 291] +0xf3,0x41,0x0f,0x38,0xd8,0x84,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesencwide128kl [rip] +0xf3,0x0f,0x38,0xd8,0x05,0x00,0x00,0x00,0x00 + +# 
CHECK: aesencwide128kl [2*rbp - 1536] +0xf3,0x0f,0x38,0xd8,0x04,0x6d,0x00,0xfa,0xff,0xff + +# CHECK: aesencwide128kl [rcx + 6096] +0xf3,0x0f,0x38,0xd8,0x81,0xd0,0x17,0x00,0x00 + +# CHECK: aesencwide128kl [rdx - 6144] +0xf3,0x0f,0x38,0xd8,0x82,0x00,0xe8,0xff,0xff + +# CHECK: aesencwide256kl [rbp + 8*r14 + 268435456] +0xf3,0x42,0x0f,0x38,0xd8,0x94,0xf5,0x00,0x00,0x00,0x10 + +# CHECK: aesencwide256kl [r8 + 4*rax + 291] +0xf3,0x41,0x0f,0x38,0xd8,0x94,0x80,0x23,0x01,0x00,0x00 + +# CHECK: aesencwide256kl [rip] +0xf3,0x0f,0x38,0xd8,0x15,0x00,0x00,0x00,0x00 + +# CHECK: aesencwide256kl [2*rbp - 2048] +0xf3,0x0f,0x38,0xd8,0x14,0x6d,0x00,0xf8,0xff,0xff + +# CHECK: aesencwide256kl [rcx + 8128] +0xf3,0x0f,0x38,0xd8,0x91,0xc0,0x1f,0x00,0x00 + +# CHECK: aesencwide256kl [rdx - 8192] +0xf3,0x0f,0x38,0xd8,0x92,0x00,0xe0,0xff,0xff diff --git a/llvm/test/MC/X86/KEYLOCKER/keylocker-att.s b/llvm/test/MC/X86/KEYLOCKER/keylocker-att.s new file mode 100644 index 00000000000000..3352a2f5ec810b --- /dev/null +++ b/llvm/test/MC/X86/KEYLOCKER/keylocker-att.s @@ -0,0 +1,205 @@ +// RUN: llvm-mc -triple i386-unknown-unknown --show-encoding %s | FileCheck %s + +// CHECK: aesdec128kl 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0x94,0xf4,0x00,0x00,0x00,0x10] + aesdec128kl 268435456(%esp,%esi,8), %xmm2 + +// CHECK: aesdec128kl 291(%edi,%eax,4), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0x94,0x87,0x23,0x01,0x00,0x00] + aesdec128kl 291(%edi,%eax,4), %xmm2 + +// CHECK: aesdec128kl (%eax), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0x10] + aesdec128kl (%eax), %xmm2 + +// CHECK: aesdec128kl -1536(,%ebp,2), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0x14,0x6d,0x00,0xfa,0xff,0xff] + aesdec128kl -1536(,%ebp,2), %xmm2 + +// CHECK: aesdec128kl 6096(%ecx), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0x91,0xd0,0x17,0x00,0x00] + aesdec128kl 6096(%ecx), %xmm2 + +// CHECK: aesdec128kl -6144(%edx), %xmm2 +// CHECK: encoding: 
[0xf3,0x0f,0x38,0xdd,0x92,0x00,0xe8,0xff,0xff] + aesdec128kl -6144(%edx), %xmm2 + +// CHECK: aesdec256kl 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdf,0x94,0xf4,0x00,0x00,0x00,0x10] + aesdec256kl 268435456(%esp,%esi,8), %xmm2 + +// CHECK: aesdec256kl 291(%edi,%eax,4), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdf,0x94,0x87,0x23,0x01,0x00,0x00] + aesdec256kl 291(%edi,%eax,4), %xmm2 + +// CHECK: aesdec256kl (%eax), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdf,0x10] + aesdec256kl (%eax), %xmm2 + +// CHECK: aesdec256kl -2048(,%ebp,2), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdf,0x14,0x6d,0x00,0xf8,0xff,0xff] + aesdec256kl -2048(,%ebp,2), %xmm2 + +// CHECK: aesdec256kl 8128(%ecx), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdf,0x91,0xc0,0x1f,0x00,0x00] + aesdec256kl 8128(%ecx), %xmm2 + +// CHECK: aesdec256kl -8192(%edx), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdf,0x92,0x00,0xe0,0xff,0xff] + aesdec256kl -8192(%edx), %xmm2 + +// CHECK: aesenc128kl 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0x94,0xf4,0x00,0x00,0x00,0x10] + aesenc128kl 268435456(%esp,%esi,8), %xmm2 + +// CHECK: aesenc128kl 291(%edi,%eax,4), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0x94,0x87,0x23,0x01,0x00,0x00] + aesenc128kl 291(%edi,%eax,4), %xmm2 + +// CHECK: aesenc128kl (%eax), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0x10] + aesenc128kl (%eax), %xmm2 + +// CHECK: aesenc128kl -1536(,%ebp,2), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0x14,0x6d,0x00,0xfa,0xff,0xff] + aesenc128kl -1536(,%ebp,2), %xmm2 + +// CHECK: aesenc128kl 6096(%ecx), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0x91,0xd0,0x17,0x00,0x00] + aesenc128kl 6096(%ecx), %xmm2 + +// CHECK: aesenc128kl -6144(%edx), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0x92,0x00,0xe8,0xff,0xff] + aesenc128kl -6144(%edx), %xmm2 + +// CHECK: aesenc256kl 268435456(%esp,%esi,8), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xde,0x94,0xf4,0x00,0x00,0x00,0x10] + 
aesenc256kl 268435456(%esp,%esi,8), %xmm2 + +// CHECK: aesenc256kl 291(%edi,%eax,4), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xde,0x94,0x87,0x23,0x01,0x00,0x00] + aesenc256kl 291(%edi,%eax,4), %xmm2 + +// CHECK: aesenc256kl (%eax), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xde,0x10] + aesenc256kl (%eax), %xmm2 + +// CHECK: aesenc256kl -2048(,%ebp,2), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xde,0x14,0x6d,0x00,0xf8,0xff,0xff] + aesenc256kl -2048(,%ebp,2), %xmm2 + +// CHECK: aesenc256kl 8128(%ecx), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xde,0x91,0xc0,0x1f,0x00,0x00] + aesenc256kl 8128(%ecx), %xmm2 + +// CHECK: aesenc256kl -8192(%edx), %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xde,0x92,0x00,0xe0,0xff,0xff] + aesenc256kl -8192(%edx), %xmm2 + +// CHECK: encodekey128 %ecx, %ecx +// CHECK: encoding: [0xf3,0x0f,0x38,0xfa,0xc9] + encodekey128 %ecx, %ecx + +// CHECK: encodekey256 %ecx, %ecx +// CHECK: encoding: [0xf3,0x0f,0x38,0xfb,0xc9] + encodekey256 %ecx, %ecx + +// CHECK: loadiwkey %xmm3, %xmm2 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0xd3] + loadiwkey %xmm3, %xmm2 + +// CHECK: aesdecwide128kl 268435456(%esp,%esi,8) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x8c,0xf4,0x00,0x00,0x00,0x10] + aesdecwide128kl 268435456(%esp,%esi,8) + +// CHECK: aesdecwide128kl 291(%edi,%eax,4) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x8c,0x87,0x23,0x01,0x00,0x00] + aesdecwide128kl 291(%edi,%eax,4) + +// CHECK: aesdecwide128kl (%eax) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x08] + aesdecwide128kl (%eax) + +// CHECK: aesdecwide128kl -1536(,%ebp,2) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x0c,0x6d,0x00,0xfa,0xff,0xff] + aesdecwide128kl -1536(,%ebp,2) + +// CHECK: aesdecwide128kl 6096(%ecx) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x89,0xd0,0x17,0x00,0x00] + aesdecwide128kl 6096(%ecx) + +// CHECK: aesdecwide128kl -6144(%edx) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x8a,0x00,0xe8,0xff,0xff] + aesdecwide128kl -6144(%edx) + +// CHECK: aesdecwide256kl 268435456(%esp,%esi,8) 
+// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x9c,0xf4,0x00,0x00,0x00,0x10] + aesdecwide256kl 268435456(%esp,%esi,8) + +// CHECK: aesdecwide256kl 291(%edi,%eax,4) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x9c,0x87,0x23,0x01,0x00,0x00] + aesdecwide256kl 291(%edi,%eax,4) + +// CHECK: aesdecwide256kl (%eax) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x18] + aesdecwide256kl (%eax) + +// CHECK: aesdecwide256kl -2048(,%ebp,2) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x1c,0x6d,0x00,0xf8,0xff,0xff] + aesdecwide256kl -2048(,%ebp,2) + +// CHECK: aesdecwide256kl 8128(%ecx) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x99,0xc0,0x1f,0x00,0x00] + aesdecwide256kl 8128(%ecx) + +// CHECK: aesdecwide256kl -8192(%edx) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x9a,0x00,0xe0,0xff,0xff] + aesdecwide256kl -8192(%edx) + +// CHECK: aesencwide128kl 268435456(%esp,%esi,8) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x84,0xf4,0x00,0x00,0x00,0x10] + aesencwide128kl 268435456(%esp,%esi,8) + +// CHECK: aesencwide128kl 291(%edi,%eax,4) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x84,0x87,0x23,0x01,0x00,0x00] + aesencwide128kl 291(%edi,%eax,4) + +// CHECK: aesencwide128kl (%eax) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x00] + aesencwide128kl (%eax) + +// CHECK: aesencwide128kl -1536(,%ebp,2) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x04,0x6d,0x00,0xfa,0xff,0xff] + aesencwide128kl -1536(,%ebp,2) + +// CHECK: aesencwide128kl 6096(%ecx) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x81,0xd0,0x17,0x00,0x00] + aesencwide128kl 6096(%ecx) + +// CHECK: aesencwide128kl -6144(%edx) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x82,0x00,0xe8,0xff,0xff] + aesencwide128kl -6144(%edx) + +// CHECK: aesencwide256kl 268435456(%esp,%esi,8) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x94,0xf4,0x00,0x00,0x00,0x10] + aesencwide256kl 268435456(%esp,%esi,8) + +// CHECK: aesencwide256kl 291(%edi,%eax,4) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x94,0x87,0x23,0x01,0x00,0x00] + aesencwide256kl 291(%edi,%eax,4) + +// CHECK: 
aesencwide256kl (%eax) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x10] + aesencwide256kl (%eax) + +// CHECK: aesencwide256kl -2048(,%ebp,2) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x14,0x6d,0x00,0xf8,0xff,0xff] + aesencwide256kl -2048(,%ebp,2) + +// CHECK: aesencwide256kl 8128(%ecx) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x91,0xc0,0x1f,0x00,0x00] + aesencwide256kl 8128(%ecx) + +// CHECK: aesencwide256kl -8192(%edx) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x92,0x00,0xe0,0xff,0xff] + aesencwide256kl -8192(%edx) diff --git a/llvm/test/MC/X86/KEYLOCKER/keylocker-intel.s b/llvm/test/MC/X86/KEYLOCKER/keylocker-intel.s new file mode 100644 index 00000000000000..7eb1e0df8c5595 --- /dev/null +++ b/llvm/test/MC/X86/KEYLOCKER/keylocker-intel.s @@ -0,0 +1,205 @@ +// RUN: llvm-mc -triple i386-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: aesdec128kl xmm2, [esp + 8*esi + 268435456] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0x94,0xf4,0x00,0x00,0x00,0x10] + aesdec128kl xmm2, [esp + 8*esi + 268435456] + +// CHECK: aesdec128kl xmm2, [edi + 4*eax + 291] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0x94,0x87,0x23,0x01,0x00,0x00] + aesdec128kl xmm2, [edi + 4*eax + 291] + +// CHECK: aesdec128kl xmm2, [eax] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0x10] + aesdec128kl xmm2, [eax] + +// CHECK: aesdec128kl xmm2, [2*ebp - 1536] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0x14,0x6d,0x00,0xfa,0xff,0xff] + aesdec128kl xmm2, [2*ebp - 1536] + +// CHECK: aesdec128kl xmm2, [ecx + 6096] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0x91,0xd0,0x17,0x00,0x00] + aesdec128kl xmm2, [ecx + 6096] + +// CHECK: aesdec128kl xmm2, [edx - 6144] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0x92,0x00,0xe8,0xff,0xff] + aesdec128kl xmm2, [edx - 6144] + +// CHECK: aesdec256kl xmm2, [esp + 8*esi + 268435456] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdf,0x94,0xf4,0x00,0x00,0x00,0x10] + aesdec256kl xmm2, [esp + 8*esi + 268435456] + +// CHECK: aesdec256kl xmm2, [edi 
+ 4*eax + 291] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdf,0x94,0x87,0x23,0x01,0x00,0x00] + aesdec256kl xmm2, [edi + 4*eax + 291] + +// CHECK: aesdec256kl xmm2, [eax] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdf,0x10] + aesdec256kl xmm2, [eax] + +// CHECK: aesdec256kl xmm2, [2*ebp - 2048] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdf,0x14,0x6d,0x00,0xf8,0xff,0xff] + aesdec256kl xmm2, [2*ebp - 2048] + +// CHECK: aesdec256kl xmm2, [ecx + 8128] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdf,0x91,0xc0,0x1f,0x00,0x00] + aesdec256kl xmm2, [ecx + 8128] + +// CHECK: aesdec256kl xmm2, [edx - 8192] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdf,0x92,0x00,0xe0,0xff,0xff] + aesdec256kl xmm2, [edx - 8192] + +// CHECK: aesenc128kl xmm2, [esp + 8*esi + 268435456] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0x94,0xf4,0x00,0x00,0x00,0x10] + aesenc128kl xmm2, [esp + 8*esi + 268435456] + +// CHECK: aesenc128kl xmm2, [edi + 4*eax + 291] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0x94,0x87,0x23,0x01,0x00,0x00] + aesenc128kl xmm2, [edi + 4*eax + 291] + +// CHECK: aesenc128kl xmm2, [eax] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0x10] + aesenc128kl xmm2, [eax] + +// CHECK: aesenc128kl xmm2, [2*ebp - 1536] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0x14,0x6d,0x00,0xfa,0xff,0xff] + aesenc128kl xmm2, [2*ebp - 1536] + +// CHECK: aesenc128kl xmm2, [ecx + 6096] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0x91,0xd0,0x17,0x00,0x00] + aesenc128kl xmm2, [ecx + 6096] + +// CHECK: aesenc128kl xmm2, [edx - 6144] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0x92,0x00,0xe8,0xff,0xff] + aesenc128kl xmm2, [edx - 6144] + +// CHECK: aesenc256kl xmm2, [esp + 8*esi + 268435456] +// CHECK: encoding: [0xf3,0x0f,0x38,0xde,0x94,0xf4,0x00,0x00,0x00,0x10] + aesenc256kl xmm2, [esp + 8*esi + 268435456] + +// CHECK: aesenc256kl xmm2, [edi + 4*eax + 291] +// CHECK: encoding: [0xf3,0x0f,0x38,0xde,0x94,0x87,0x23,0x01,0x00,0x00] + aesenc256kl xmm2, [edi + 4*eax + 291] + +// CHECK: aesenc256kl xmm2, [eax] +// CHECK: encoding: 
[0xf3,0x0f,0x38,0xde,0x10] + aesenc256kl xmm2, [eax] + +// CHECK: aesenc256kl xmm2, [2*ebp - 2048] +// CHECK: encoding: [0xf3,0x0f,0x38,0xde,0x14,0x6d,0x00,0xf8,0xff,0xff] + aesenc256kl xmm2, [2*ebp - 2048] + +// CHECK: aesenc256kl xmm2, [ecx + 8128] +// CHECK: encoding: [0xf3,0x0f,0x38,0xde,0x91,0xc0,0x1f,0x00,0x00] + aesenc256kl xmm2, [ecx + 8128] + +// CHECK: aesenc256kl xmm2, [edx - 8192] +// CHECK: encoding: [0xf3,0x0f,0x38,0xde,0x92,0x00,0xe0,0xff,0xff] + aesenc256kl xmm2, [edx - 8192] + +// CHECK: encodekey128 ecx, ecx +// CHECK: encoding: [0xf3,0x0f,0x38,0xfa,0xc9] + encodekey128 ecx, ecx + +// CHECK: encodekey256 ecx, ecx +// CHECK: encoding: [0xf3,0x0f,0x38,0xfb,0xc9] + encodekey256 ecx, ecx + +// CHECK: loadiwkey xmm2, xmm3 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0xd3] + loadiwkey xmm2, xmm3 + +// CHECK: aesdecwide128kl [esp + 8*esi + 268435456] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x8c,0xf4,0x00,0x00,0x00,0x10] + aesdecwide128kl [esp + 8*esi + 268435456] + +// CHECK: aesdecwide128kl [edi + 4*eax + 291] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x8c,0x87,0x23,0x01,0x00,0x00] + aesdecwide128kl [edi + 4*eax + 291] + +// CHECK: aesdecwide128kl [eax] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x08] + aesdecwide128kl [eax] + +// CHECK: aesdecwide128kl [2*ebp - 1536] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x0c,0x6d,0x00,0xfa,0xff,0xff] + aesdecwide128kl [2*ebp - 1536] + +// CHECK: aesdecwide128kl [ecx + 6096] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x89,0xd0,0x17,0x00,0x00] + aesdecwide128kl [ecx + 6096] + +// CHECK: aesdecwide128kl [edx - 6144] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x8a,0x00,0xe8,0xff,0xff] + aesdecwide128kl [edx - 6144] + +// CHECK: aesdecwide256kl [esp + 8*esi + 268435456] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x9c,0xf4,0x00,0x00,0x00,0x10] + aesdecwide256kl [esp + 8*esi + 268435456] + +// CHECK: aesdecwide256kl [edi + 4*eax + 291] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x9c,0x87,0x23,0x01,0x00,0x00] + aesdecwide256kl 
[edi + 4*eax + 291] + +// CHECK: aesdecwide256kl [eax] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x18] + aesdecwide256kl [eax] + +// CHECK: aesdecwide256kl [2*ebp - 2048] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x1c,0x6d,0x00,0xf8,0xff,0xff] + aesdecwide256kl [2*ebp - 2048] + +// CHECK: aesdecwide256kl [ecx + 8128] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x99,0xc0,0x1f,0x00,0x00] + aesdecwide256kl [ecx + 8128] + +// CHECK: aesdecwide256kl [edx - 8192] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x9a,0x00,0xe0,0xff,0xff] + aesdecwide256kl [edx - 8192] + +// CHECK: aesencwide128kl [esp + 8*esi + 268435456] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x84,0xf4,0x00,0x00,0x00,0x10] + aesencwide128kl [esp + 8*esi + 268435456] + +// CHECK: aesencwide128kl [edi + 4*eax + 291] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x84,0x87,0x23,0x01,0x00,0x00] + aesencwide128kl [edi + 4*eax + 291] + +// CHECK: aesencwide128kl [eax] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x00] + aesencwide128kl [eax] + +// CHECK: aesencwide128kl [2*ebp - 1536] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x04,0x6d,0x00,0xfa,0xff,0xff] + aesencwide128kl [2*ebp - 1536] + +// CHECK: aesencwide128kl [ecx + 6096] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x81,0xd0,0x17,0x00,0x00] + aesencwide128kl [ecx + 6096] + +// CHECK: aesencwide128kl [edx - 6144] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x82,0x00,0xe8,0xff,0xff] + aesencwide128kl [edx - 6144] + +// CHECK: aesencwide256kl [esp + 8*esi + 268435456] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x94,0xf4,0x00,0x00,0x00,0x10] + aesencwide256kl [esp + 8*esi + 268435456] + +// CHECK: aesencwide256kl [edi + 4*eax + 291] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x94,0x87,0x23,0x01,0x00,0x00] + aesencwide256kl [edi + 4*eax + 291] + +// CHECK: aesencwide256kl [eax] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x10] + aesencwide256kl [eax] + +// CHECK: aesencwide256kl [2*ebp - 2048] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x14,0x6d,0x00,0xf8,0xff,0xff] + 
aesencwide256kl [2*ebp - 2048] + +// CHECK: aesencwide256kl [ecx + 8128] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x91,0xc0,0x1f,0x00,0x00] + aesencwide256kl [ecx + 8128] + +// CHECK: aesencwide256kl [edx - 8192] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x92,0x00,0xe0,0xff,0xff] + aesencwide256kl [edx - 8192] diff --git a/llvm/test/MC/X86/KEYLOCKER/x86-64-keylocker-att.s b/llvm/test/MC/X86/KEYLOCKER/x86-64-keylocker-att.s new file mode 100644 index 00000000000000..dc467d76c28722 --- /dev/null +++ b/llvm/test/MC/X86/KEYLOCKER/x86-64-keylocker-att.s @@ -0,0 +1,205 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s + +// CHECK: aesdec128kl 268435456(%rbp,%r14,8), %xmm6 +// CHECK: encoding: [0xf3,0x42,0x0f,0x38,0xdd,0xb4,0xf5,0x00,0x00,0x00,0x10] + aesdec128kl 268435456(%rbp,%r14,8), %xmm6 + +// CHECK: aesdec128kl 291(%r8,%rax,4), %xmm6 +// CHECK: encoding: [0xf3,0x41,0x0f,0x38,0xdd,0xb4,0x80,0x23,0x01,0x00,0x00] + aesdec128kl 291(%r8,%rax,4), %xmm6 + +// CHECK: aesdec128kl (%rip), %xmm6 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0x35,0x00,0x00,0x00,0x00] + aesdec128kl (%rip), %xmm6 + +// CHECK: aesdec128kl -1536(,%rbp,2), %xmm6 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0x34,0x6d,0x00,0xfa,0xff,0xff] + aesdec128kl -1536(,%rbp,2), %xmm6 + +// CHECK: aesdec128kl 6096(%rcx), %xmm6 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0xb1,0xd0,0x17,0x00,0x00] + aesdec128kl 6096(%rcx), %xmm6 + +// CHECK: aesdec128kl -6144(%rdx), %xmm6 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0xb2,0x00,0xe8,0xff,0xff] + aesdec128kl -6144(%rdx), %xmm6 + +// CHECK: aesdec256kl 268435456(%rbp,%r14,8), %xmm6 +// CHECK: encoding: [0xf3,0x42,0x0f,0x38,0xdf,0xb4,0xf5,0x00,0x00,0x00,0x10] + aesdec256kl 268435456(%rbp,%r14,8), %xmm6 + +// CHECK: aesdec256kl 291(%r8,%rax,4), %xmm6 +// CHECK: encoding: [0xf3,0x41,0x0f,0x38,0xdf,0xb4,0x80,0x23,0x01,0x00,0x00] + aesdec256kl 291(%r8,%rax,4), %xmm6 + +// CHECK: aesdec256kl (%rip), %xmm6 +// CHECK: encoding: 
[0xf3,0x0f,0x38,0xdf,0x35,0x00,0x00,0x00,0x00] + aesdec256kl (%rip), %xmm6 + +// CHECK: aesdec256kl -2048(,%rbp,2), %xmm6 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdf,0x34,0x6d,0x00,0xf8,0xff,0xff] + aesdec256kl -2048(,%rbp,2), %xmm6 + +// CHECK: aesdec256kl 8128(%rcx), %xmm6 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdf,0xb1,0xc0,0x1f,0x00,0x00] + aesdec256kl 8128(%rcx), %xmm6 + +// CHECK: aesdec256kl -8192(%rdx), %xmm6 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdf,0xb2,0x00,0xe0,0xff,0xff] + aesdec256kl -8192(%rdx), %xmm6 + +// CHECK: aesenc128kl 268435456(%rbp,%r14,8), %xmm6 +// CHECK: encoding: [0xf3,0x42,0x0f,0x38,0xdc,0xb4,0xf5,0x00,0x00,0x00,0x10] + aesenc128kl 268435456(%rbp,%r14,8), %xmm6 + +// CHECK: aesenc128kl 291(%r8,%rax,4), %xmm6 +// CHECK: encoding: [0xf3,0x41,0x0f,0x38,0xdc,0xb4,0x80,0x23,0x01,0x00,0x00] + aesenc128kl 291(%r8,%rax,4), %xmm6 + +// CHECK: aesenc128kl (%rip), %xmm6 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0x35,0x00,0x00,0x00,0x00] + aesenc128kl (%rip), %xmm6 + +// CHECK: aesenc128kl -1536(,%rbp,2), %xmm6 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0x34,0x6d,0x00,0xfa,0xff,0xff] + aesenc128kl -1536(,%rbp,2), %xmm6 + +// CHECK: aesenc128kl 6096(%rcx), %xmm6 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0xb1,0xd0,0x17,0x00,0x00] + aesenc128kl 6096(%rcx), %xmm6 + +// CHECK: aesenc128kl -6144(%rdx), %xmm6 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0xb2,0x00,0xe8,0xff,0xff] + aesenc128kl -6144(%rdx), %xmm6 + +// CHECK: aesenc256kl 268435456(%rbp,%r14,8), %xmm6 +// CHECK: encoding: [0xf3,0x42,0x0f,0x38,0xde,0xb4,0xf5,0x00,0x00,0x00,0x10] + aesenc256kl 268435456(%rbp,%r14,8), %xmm6 + +// CHECK: aesenc256kl 291(%r8,%rax,4), %xmm6 +// CHECK: encoding: [0xf3,0x41,0x0f,0x38,0xde,0xb4,0x80,0x23,0x01,0x00,0x00] + aesenc256kl 291(%r8,%rax,4), %xmm6 + +// CHECK: aesenc256kl (%rip), %xmm6 +// CHECK: encoding: [0xf3,0x0f,0x38,0xde,0x35,0x00,0x00,0x00,0x00] + aesenc256kl (%rip), %xmm6 + +// CHECK: aesenc256kl -2048(,%rbp,2), %xmm6 +// CHECK: encoding: 
[0xf3,0x0f,0x38,0xde,0x34,0x6d,0x00,0xf8,0xff,0xff] + aesenc256kl -2048(,%rbp,2), %xmm6 + +// CHECK: aesenc256kl 8128(%rcx), %xmm6 +// CHECK: encoding: [0xf3,0x0f,0x38,0xde,0xb1,0xc0,0x1f,0x00,0x00] + aesenc256kl 8128(%rcx), %xmm6 + +// CHECK: aesenc256kl -8192(%rdx), %xmm6 +// CHECK: encoding: [0xf3,0x0f,0x38,0xde,0xb2,0x00,0xe0,0xff,0xff] + aesenc256kl -8192(%rdx), %xmm6 + +// CHECK: encodekey128 %ecx, %ecx +// CHECK: encoding: [0xf3,0x0f,0x38,0xfa,0xc9] + encodekey128 %ecx, %ecx + +// CHECK: encodekey256 %ecx, %ecx +// CHECK: encoding: [0xf3,0x0f,0x38,0xfb,0xc9] + encodekey256 %ecx, %ecx + +// CHECK: loadiwkey %xmm7, %xmm6 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0xf7] + loadiwkey %xmm7, %xmm6 + +// CHECK: aesdecwide128kl 268435456(%rbp,%r14,8) +// CHECK: encoding: [0xf3,0x42,0x0f,0x38,0xd8,0x8c,0xf5,0x00,0x00,0x00,0x10] + aesdecwide128kl 268435456(%rbp,%r14,8) + +// CHECK: aesdecwide128kl 291(%r8,%rax,4) +// CHECK: encoding: [0xf3,0x41,0x0f,0x38,0xd8,0x8c,0x80,0x23,0x01,0x00,0x00] + aesdecwide128kl 291(%r8,%rax,4) + +// CHECK: aesdecwide128kl (%rip) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x0d,0x00,0x00,0x00,0x00] + aesdecwide128kl (%rip) + +// CHECK: aesdecwide128kl -1536(,%rbp,2) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x0c,0x6d,0x00,0xfa,0xff,0xff] + aesdecwide128kl -1536(,%rbp,2) + +// CHECK: aesdecwide128kl 6096(%rcx) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x89,0xd0,0x17,0x00,0x00] + aesdecwide128kl 6096(%rcx) + +// CHECK: aesdecwide128kl -6144(%rdx) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x8a,0x00,0xe8,0xff,0xff] + aesdecwide128kl -6144(%rdx) + +// CHECK: aesdecwide256kl 268435456(%rbp,%r14,8) +// CHECK: encoding: [0xf3,0x42,0x0f,0x38,0xd8,0x9c,0xf5,0x00,0x00,0x00,0x10] + aesdecwide256kl 268435456(%rbp,%r14,8) + +// CHECK: aesdecwide256kl 291(%r8,%rax,4) +// CHECK: encoding: [0xf3,0x41,0x0f,0x38,0xd8,0x9c,0x80,0x23,0x01,0x00,0x00] + aesdecwide256kl 291(%r8,%rax,4) + +// CHECK: aesdecwide256kl (%rip) +// CHECK: encoding: 
[0xf3,0x0f,0x38,0xd8,0x1d,0x00,0x00,0x00,0x00] + aesdecwide256kl (%rip) + +// CHECK: aesdecwide256kl -2048(,%rbp,2) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x1c,0x6d,0x00,0xf8,0xff,0xff] + aesdecwide256kl -2048(,%rbp,2) + +// CHECK: aesdecwide256kl 8128(%rcx) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x99,0xc0,0x1f,0x00,0x00] + aesdecwide256kl 8128(%rcx) + +// CHECK: aesdecwide256kl -8192(%rdx) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x9a,0x00,0xe0,0xff,0xff] + aesdecwide256kl -8192(%rdx) + +// CHECK: aesencwide128kl 268435456(%rbp,%r14,8) +// CHECK: encoding: [0xf3,0x42,0x0f,0x38,0xd8,0x84,0xf5,0x00,0x00,0x00,0x10] + aesencwide128kl 268435456(%rbp,%r14,8) + +// CHECK: aesencwide128kl 291(%r8,%rax,4) +// CHECK: encoding: [0xf3,0x41,0x0f,0x38,0xd8,0x84,0x80,0x23,0x01,0x00,0x00] + aesencwide128kl 291(%r8,%rax,4) + +// CHECK: aesencwide128kl (%rip) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x05,0x00,0x00,0x00,0x00] + aesencwide128kl (%rip) + +// CHECK: aesencwide128kl -1536(,%rbp,2) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x04,0x6d,0x00,0xfa,0xff,0xff] + aesencwide128kl -1536(,%rbp,2) + +// CHECK: aesencwide128kl 6096(%rcx) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x81,0xd0,0x17,0x00,0x00] + aesencwide128kl 6096(%rcx) + +// CHECK: aesencwide128kl -6144(%rdx) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x82,0x00,0xe8,0xff,0xff] + aesencwide128kl -6144(%rdx) + +// CHECK: aesencwide256kl 268435456(%rbp,%r14,8) +// CHECK: encoding: [0xf3,0x42,0x0f,0x38,0xd8,0x94,0xf5,0x00,0x00,0x00,0x10] + aesencwide256kl 268435456(%rbp,%r14,8) + +// CHECK: aesencwide256kl 291(%r8,%rax,4) +// CHECK: encoding: [0xf3,0x41,0x0f,0x38,0xd8,0x94,0x80,0x23,0x01,0x00,0x00] + aesencwide256kl 291(%r8,%rax,4) + +// CHECK: aesencwide256kl (%rip) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x15,0x00,0x00,0x00,0x00] + aesencwide256kl (%rip) + +// CHECK: aesencwide256kl -2048(,%rbp,2) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x14,0x6d,0x00,0xf8,0xff,0xff] + aesencwide256kl -2048(,%rbp,2) + +// 
CHECK: aesencwide256kl 8128(%rcx) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x91,0xc0,0x1f,0x00,0x00] + aesencwide256kl 8128(%rcx) + +// CHECK: aesencwide256kl -8192(%rdx) +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x92,0x00,0xe0,0xff,0xff] + aesencwide256kl -8192(%rdx) diff --git a/llvm/test/MC/X86/KEYLOCKER/x86-64-keylocker-intel.s b/llvm/test/MC/X86/KEYLOCKER/x86-64-keylocker-intel.s new file mode 100644 index 00000000000000..cb8921acdc1ff0 --- /dev/null +++ b/llvm/test/MC/X86/KEYLOCKER/x86-64-keylocker-intel.s @@ -0,0 +1,205 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: aesdec128kl xmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xf3,0x42,0x0f,0x38,0xdd,0xb4,0xf5,0x00,0x00,0x00,0x10] + aesdec128kl xmm6, [rbp + 8*r14 + 268435456] + +// CHECK: aesdec128kl xmm6, [r8 + 4*rax + 291] +// CHECK: encoding: [0xf3,0x41,0x0f,0x38,0xdd,0xb4,0x80,0x23,0x01,0x00,0x00] + aesdec128kl xmm6, [r8 + 4*rax + 291] + +// CHECK: aesdec128kl xmm6, [rip] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0x35,0x00,0x00,0x00,0x00] + aesdec128kl xmm6, [rip] + +// CHECK: aesdec128kl xmm6, [2*rbp - 1536] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0x34,0x6d,0x00,0xfa,0xff,0xff] + aesdec128kl xmm6, [2*rbp - 1536] + +// CHECK: aesdec128kl xmm6, [rcx + 6096] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0xb1,0xd0,0x17,0x00,0x00] + aesdec128kl xmm6, [rcx + 6096] + +// CHECK: aesdec128kl xmm6, [rdx - 6144] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdd,0xb2,0x00,0xe8,0xff,0xff] + aesdec128kl xmm6, [rdx - 6144] + +// CHECK: aesdec256kl xmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xf3,0x42,0x0f,0x38,0xdf,0xb4,0xf5,0x00,0x00,0x00,0x10] + aesdec256kl xmm6, [rbp + 8*r14 + 268435456] + +// CHECK: aesdec256kl xmm6, [r8 + 4*rax + 291] +// CHECK: encoding: [0xf3,0x41,0x0f,0x38,0xdf,0xb4,0x80,0x23,0x01,0x00,0x00] + aesdec256kl xmm6, [r8 + 4*rax + 291] + +// CHECK: aesdec256kl xmm6, [rip] +// CHECK: encoding: 
[0xf3,0x0f,0x38,0xdf,0x35,0x00,0x00,0x00,0x00] + aesdec256kl xmm6, [rip] + +// CHECK: aesdec256kl xmm6, [2*rbp - 2048] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdf,0x34,0x6d,0x00,0xf8,0xff,0xff] + aesdec256kl xmm6, [2*rbp - 2048] + +// CHECK: aesdec256kl xmm6, [rcx + 8128] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdf,0xb1,0xc0,0x1f,0x00,0x00] + aesdec256kl xmm6, [rcx + 8128] + +// CHECK: aesdec256kl xmm6, [rdx - 8192] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdf,0xb2,0x00,0xe0,0xff,0xff] + aesdec256kl xmm6, [rdx - 8192] + +// CHECK: aesenc128kl xmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xf3,0x42,0x0f,0x38,0xdc,0xb4,0xf5,0x00,0x00,0x00,0x10] + aesenc128kl xmm6, [rbp + 8*r14 + 268435456] + +// CHECK: aesenc128kl xmm6, [r8 + 4*rax + 291] +// CHECK: encoding: [0xf3,0x41,0x0f,0x38,0xdc,0xb4,0x80,0x23,0x01,0x00,0x00] + aesenc128kl xmm6, [r8 + 4*rax + 291] + +// CHECK: aesenc128kl xmm6, [rip] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0x35,0x00,0x00,0x00,0x00] + aesenc128kl xmm6, [rip] + +// CHECK: aesenc128kl xmm6, [2*rbp - 1536] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0x34,0x6d,0x00,0xfa,0xff,0xff] + aesenc128kl xmm6, [2*rbp - 1536] + +// CHECK: aesenc128kl xmm6, [rcx + 6096] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0xb1,0xd0,0x17,0x00,0x00] + aesenc128kl xmm6, [rcx + 6096] + +// CHECK: aesenc128kl xmm6, [rdx - 6144] +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0xb2,0x00,0xe8,0xff,0xff] + aesenc128kl xmm6, [rdx - 6144] + +// CHECK: aesenc256kl xmm6, [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xf3,0x42,0x0f,0x38,0xde,0xb4,0xf5,0x00,0x00,0x00,0x10] + aesenc256kl xmm6, [rbp + 8*r14 + 268435456] + +// CHECK: aesenc256kl xmm6, [r8 + 4*rax + 291] +// CHECK: encoding: [0xf3,0x41,0x0f,0x38,0xde,0xb4,0x80,0x23,0x01,0x00,0x00] + aesenc256kl xmm6, [r8 + 4*rax + 291] + +// CHECK: aesenc256kl xmm6, [rip] +// CHECK: encoding: [0xf3,0x0f,0x38,0xde,0x35,0x00,0x00,0x00,0x00] + aesenc256kl xmm6, [rip] + +// CHECK: aesenc256kl xmm6, [2*rbp - 2048] +// CHECK: encoding: 
[0xf3,0x0f,0x38,0xde,0x34,0x6d,0x00,0xf8,0xff,0xff] + aesenc256kl xmm6, [2*rbp - 2048] + +// CHECK: aesenc256kl xmm6, [rcx + 8128] +// CHECK: encoding: [0xf3,0x0f,0x38,0xde,0xb1,0xc0,0x1f,0x00,0x00] + aesenc256kl xmm6, [rcx + 8128] + +// CHECK: aesenc256kl xmm6, [rdx - 8192] +// CHECK: encoding: [0xf3,0x0f,0x38,0xde,0xb2,0x00,0xe0,0xff,0xff] + aesenc256kl xmm6, [rdx - 8192] + +// CHECK: encodekey128 ecx, ecx +// CHECK: encoding: [0xf3,0x0f,0x38,0xfa,0xc9] + encodekey128 ecx, ecx + +// CHECK: encodekey256 ecx, ecx +// CHECK: encoding: [0xf3,0x0f,0x38,0xfb,0xc9] + encodekey256 ecx, ecx + +// CHECK: loadiwkey xmm6, xmm7 +// CHECK: encoding: [0xf3,0x0f,0x38,0xdc,0xf7] + loadiwkey xmm6, xmm7 + +// CHECK: aesdecwide128kl [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xf3,0x42,0x0f,0x38,0xd8,0x8c,0xf5,0x00,0x00,0x00,0x10] + aesdecwide128kl [rbp + 8*r14 + 268435456] + +// CHECK: aesdecwide128kl [r8 + 4*rax + 291] +// CHECK: encoding: [0xf3,0x41,0x0f,0x38,0xd8,0x8c,0x80,0x23,0x01,0x00,0x00] + aesdecwide128kl [r8 + 4*rax + 291] + +// CHECK: aesdecwide128kl [rip] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x0d,0x00,0x00,0x00,0x00] + aesdecwide128kl [rip] + +// CHECK: aesdecwide128kl [2*rbp - 1536] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x0c,0x6d,0x00,0xfa,0xff,0xff] + aesdecwide128kl [2*rbp - 1536] + +// CHECK: aesdecwide128kl [rcx + 6096] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x89,0xd0,0x17,0x00,0x00] + aesdecwide128kl [rcx + 6096] + +// CHECK: aesdecwide128kl [rdx - 6144] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x8a,0x00,0xe8,0xff,0xff] + aesdecwide128kl [rdx - 6144] + +// CHECK: aesdecwide256kl [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xf3,0x42,0x0f,0x38,0xd8,0x9c,0xf5,0x00,0x00,0x00,0x10] + aesdecwide256kl [rbp + 8*r14 + 268435456] + +// CHECK: aesdecwide256kl [r8 + 4*rax + 291] +// CHECK: encoding: [0xf3,0x41,0x0f,0x38,0xd8,0x9c,0x80,0x23,0x01,0x00,0x00] + aesdecwide256kl [r8 + 4*rax + 291] + +// CHECK: aesdecwide256kl [rip] +// CHECK: encoding: 
[0xf3,0x0f,0x38,0xd8,0x1d,0x00,0x00,0x00,0x00] + aesdecwide256kl [rip] + +// CHECK: aesdecwide256kl [2*rbp - 2048] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x1c,0x6d,0x00,0xf8,0xff,0xff] + aesdecwide256kl [2*rbp - 2048] + +// CHECK: aesdecwide256kl [rcx + 8128] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x99,0xc0,0x1f,0x00,0x00] + aesdecwide256kl [rcx + 8128] + +// CHECK: aesdecwide256kl [rdx - 8192] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x9a,0x00,0xe0,0xff,0xff] + aesdecwide256kl [rdx - 8192] + +// CHECK: aesencwide128kl [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xf3,0x42,0x0f,0x38,0xd8,0x84,0xf5,0x00,0x00,0x00,0x10] + aesencwide128kl [rbp + 8*r14 + 268435456] + +// CHECK: aesencwide128kl [r8 + 4*rax + 291] +// CHECK: encoding: [0xf3,0x41,0x0f,0x38,0xd8,0x84,0x80,0x23,0x01,0x00,0x00] + aesencwide128kl [r8 + 4*rax + 291] + +// CHECK: aesencwide128kl [rip] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x05,0x00,0x00,0x00,0x00] + aesencwide128kl [rip] + +// CHECK: aesencwide128kl [2*rbp - 1536] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x04,0x6d,0x00,0xfa,0xff,0xff] + aesencwide128kl [2*rbp - 1536] + +// CHECK: aesencwide128kl [rcx + 6096] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x81,0xd0,0x17,0x00,0x00] + aesencwide128kl [rcx + 6096] + +// CHECK: aesencwide128kl [rdx - 6144] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x82,0x00,0xe8,0xff,0xff] + aesencwide128kl [rdx - 6144] + +// CHECK: aesencwide256kl [rbp + 8*r14 + 268435456] +// CHECK: encoding: [0xf3,0x42,0x0f,0x38,0xd8,0x94,0xf5,0x00,0x00,0x00,0x10] + aesencwide256kl [rbp + 8*r14 + 268435456] + +// CHECK: aesencwide256kl [r8 + 4*rax + 291] +// CHECK: encoding: [0xf3,0x41,0x0f,0x38,0xd8,0x94,0x80,0x23,0x01,0x00,0x00] + aesencwide256kl [r8 + 4*rax + 291] + +// CHECK: aesencwide256kl [rip] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x15,0x00,0x00,0x00,0x00] + aesencwide256kl [rip] + +// CHECK: aesencwide256kl [2*rbp - 2048] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x14,0x6d,0x00,0xf8,0xff,0xff] + 
aesencwide256kl [2*rbp - 2048] + +// CHECK: aesencwide256kl [rcx + 8128] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x91,0xc0,0x1f,0x00,0x00] + aesencwide256kl [rcx + 8128] + +// CHECK: aesencwide256kl [rdx - 8192] +// CHECK: encoding: [0xf3,0x0f,0x38,0xd8,0x92,0x00,0xe0,0xff,0xff] + aesencwide256kl [rdx - 8192] diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp index 4e368fac2c834d..6a8a60d00639dd 100644 --- a/llvm/utils/TableGen/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp @@ -246,7 +246,8 @@ enum IIT_Info { IIT_SUBDIVIDE4_ARG = 45, IIT_VEC_OF_BITCASTS_TO_INT = 46, IIT_V128 = 47, - IIT_BF16 = 48 + IIT_BF16 = 48, + IIT_STRUCT9 = 49 }; static void EncodeFixedValueType(MVT::SimpleValueType VT, @@ -469,6 +470,7 @@ static void ComputeFixedEncoding(const CodeGenIntrinsic &Int, case 6: TypeSig.push_back(IIT_STRUCT6); break; case 7: TypeSig.push_back(IIT_STRUCT7); break; case 8: TypeSig.push_back(IIT_STRUCT8); break; + case 9: TypeSig.push_back(IIT_STRUCT9); break; default: llvm_unreachable("Unhandled case in struct"); } From e39d7884a1f5c5c7136ba2e493e9ac313ccc78ed Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 30 Sep 2020 10:09:34 +0000 Subject: [PATCH 2/8] [gn build] Port 413577a8790 --- llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn index c43e531fc71801..811faf52b18315 100644 --- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn @@ -143,6 +143,8 @@ copy("Headers") { "inttypes.h", "invpcidintrin.h", "iso646.h", + "keylocker_wide_intrin.h", + "keylockerintrin.h", "limits.h", "lwpintrin.h", "lzcntintrin.h", From ec3f24d4538d1c262377331c7b35ea66e023cf98 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 30 Sep 2020 11:13:54 +0100 Subject: [PATCH 3/8] [InstCombine] 
recognizeBSwapOrBitReverseIdiom - assert for correct bit provenance indices. NFCI. As suggested by @spatel on D88316 --- llvm/lib/Transforms/Utils/Local.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 61f4dffb641ca0..8ff11ba4cab47d 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3027,6 +3027,9 @@ bool llvm::recognizeBSwapOrBitReverseIdiom( if (!Res) return false; auto &BitProvenance = Res->Provenance; + assert(all_of(BitProvenance, + [](int8_t I) { return I == BitPart::Unset || 0 <= I; }) && + "Illegal bit provenance index"); // Now, is the bit permutation correct for a bswap or a bitreverse? We can // only byteswap values with an even number of bytes. From af47d40b9c68744eb66aa2ef779065e946aaa099 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 30 Sep 2020 12:07:19 +0100 Subject: [PATCH 4/8] [InstCombine] recognizeBSwapOrBitReverseIdiom - recognise zext(bswap(trunc(x))) patterns (PR39793) PR39793 demonstrated an issue where we fail to recognize 'partial' bswap patterns of the lower bytes of an integer source. In fact, most of this is already in place: collectBitParts suitably tags zero bits, so we just need to correctly handle this case by finding the zero'd upper bits and reducing the bswap pattern just to the active demanded bits. 
Differential Revision: https://reviews.llvm.org/D88316 --- llvm/lib/Transforms/Utils/Local.cpp | 21 +++--- llvm/test/Transforms/InstCombine/bswap.ll | 82 +++++++++++------------ 2 files changed, 54 insertions(+), 49 deletions(-) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 8ff11ba4cab47d..4eb458d217e023 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3010,29 +3010,34 @@ bool llvm::recognizeBSwapOrBitReverseIdiom( IntegerType *ITy = dyn_cast(I->getType()); if (!ITy || ITy->getBitWidth() > 128) return false; // Can't do vectors or integers > 128 bits. - unsigned BW = ITy->getBitWidth(); - unsigned DemandedBW = BW; IntegerType *DemandedTy = ITy; - if (I->hasOneUse()) { - if (TruncInst *Trunc = dyn_cast(I->user_back())) { + if (I->hasOneUse()) + if (auto *Trunc = dyn_cast(I->user_back())) DemandedTy = cast(Trunc->getType()); - DemandedBW = DemandedTy->getBitWidth(); - } - } // Try to find all the pieces corresponding to the bswap. std::map> BPS; auto Res = collectBitParts(I, MatchBSwaps, MatchBitReversals, BPS, 0); if (!Res) return false; - auto &BitProvenance = Res->Provenance; + ArrayRef BitProvenance = Res->Provenance; assert(all_of(BitProvenance, [](int8_t I) { return I == BitPart::Unset || 0 <= I; }) && "Illegal bit provenance index"); + // If the upper bits are zero, then attempt to perform as a truncated op. + if (BitProvenance[BitProvenance.size() - 1] == BitPart::Unset) { + while (!BitProvenance.empty() && BitProvenance.back() == BitPart::Unset) + BitProvenance = BitProvenance.drop_back(); + if (BitProvenance.empty()) + return false; // TODO - handle null value? + DemandedTy = IntegerType::get(I->getContext(), BitProvenance.size()); + } + // Now, is the bit permutation correct for a bswap or a bitreverse? We can // only byteswap values with an even number of bytes. 
+ unsigned DemandedBW = DemandedTy->getBitWidth(); bool OKForBSwap = DemandedBW % 16 == 0, OKForBitReverse = true; for (unsigned i = 0; i < DemandedBW; ++i) { OKForBSwap &= diff --git a/llvm/test/Transforms/InstCombine/bswap.ll b/llvm/test/Transforms/InstCombine/bswap.ll index 41d3c5b58c2f46..5f9a8078f5415c 100644 --- a/llvm/test/Transforms/InstCombine/bswap.ll +++ b/llvm/test/Transforms/InstCombine/bswap.ll @@ -187,8 +187,8 @@ define i32 @bswap32_shl_first_extra_use(i32 %x) { define i16 @test8(i16 %a) { ; CHECK-LABEL: @test8( -; CHECK-NEXT: [[REV:%.*]] = call i16 @llvm.bswap.i16(i16 [[A:%.*]]) -; CHECK-NEXT: ret i16 [[REV]] +; CHECK-NEXT: [[OR:%.*]] = call i16 @llvm.bswap.i16(i16 [[A:%.*]]) +; CHECK-NEXT: ret i16 [[OR]] ; %conv = zext i16 %a to i32 %shr = lshr i16 %a, 8 @@ -201,8 +201,8 @@ define i16 @test8(i16 %a) { define i16 @test9(i16 %a) { ; CHECK-LABEL: @test9( -; CHECK-NEXT: [[REV:%.*]] = call i16 @llvm.bswap.i16(i16 [[A:%.*]]) -; CHECK-NEXT: ret i16 [[REV]] +; CHECK-NEXT: [[OR:%.*]] = call i16 @llvm.bswap.i16(i16 [[A:%.*]]) +; CHECK-NEXT: ret i16 [[OR]] ; %conv = zext i16 %a to i32 %shr = lshr i32 %conv, 8 @@ -229,18 +229,10 @@ define i16 @test10(i32 %a) { define i64 @PR39793_bswap_u64_as_u32(i64 %0) { ; CHECK-LABEL: @PR39793_bswap_u64_as_u32( -; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0:%.*]], 24 -; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], 255 -; CHECK-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP0]], 8 -; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 65280 -; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP0]], 8 -; CHECK-NEXT: [[TMP8:%.*]] = and i64 [[TMP7]], 16711680 -; CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP0]], 24 -; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP10]], 4278190080 -; CHECK-NEXT: [[TMP12:%.*]] = or i64 [[TMP9]], [[TMP11]] -; CHECK-NEXT: ret i64 [[TMP12]] +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[TMP0:%.*]] to i32 +; CHECK-NEXT: [[REV:%.*]] = 
call i32 @llvm.bswap.i32(i32 [[TRUNC]]) +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[REV]] to i64 +; CHECK-NEXT: ret i64 [[TMP2]] ; %2 = lshr i64 %0, 24 %3 = and i64 %2, 255 @@ -258,13 +250,10 @@ define i64 @PR39793_bswap_u64_as_u32(i64 %0) { define i16 @PR39793_bswap_u64_as_u32_trunc(i64 %0) { ; CHECK-LABEL: @PR39793_bswap_u64_as_u32_trunc( -; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0:%.*]], 24 -; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], 255 -; CHECK-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP0]], 8 -; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 65280 -; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i16 -; CHECK-NEXT: ret i16 [[TMP7]] +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[TMP0:%.*]] to i32 +; CHECK-NEXT: [[REV:%.*]] = call i32 @llvm.bswap.i32(i32 [[TRUNC]]) +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[REV]] to i16 +; CHECK-NEXT: ret i16 [[TMP2]] ; %2 = lshr i64 %0, 24 %3 = and i64 %2, 255 @@ -283,12 +272,10 @@ define i16 @PR39793_bswap_u64_as_u32_trunc(i64 %0) { define i64 @PR39793_bswap_u64_as_u16(i64 %0) { ; CHECK-LABEL: @PR39793_bswap_u64_as_u16( -; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0:%.*]], 8 -; CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP2]], 255 -; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[TMP0]], 8 -; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 65280 -; CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] -; CHECK-NEXT: ret i64 [[TMP6]] +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[TMP0:%.*]] to i16 +; CHECK-NEXT: [[REV:%.*]] = call i16 @llvm.bswap.i16(i16 [[TRUNC]]) +; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[REV]] to i64 +; CHECK-NEXT: ret i64 [[TMP2]] ; %2 = lshr i64 %0, 8 %3 = and i64 %2, 255 @@ -300,9 +287,9 @@ define i64 @PR39793_bswap_u64_as_u16(i64 %0) { define i8 @PR39793_bswap_u64_as_u16_trunc(i64 %0) { ; CHECK-LABEL: @PR39793_bswap_u64_as_u16_trunc( -; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0:%.*]], 8 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i8 -; CHECK-NEXT: ret i8 [[TMP3]] +; 
CHECK-NEXT: [[REV1:%.*]] = lshr i64 [[TMP0:%.*]], 8 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[REV1]] to i8 +; CHECK-NEXT: ret i8 [[TMP2]] ; %2 = lshr i64 %0, 8 %3 = and i64 %2, 255 @@ -313,14 +300,27 @@ define i8 @PR39793_bswap_u64_as_u16_trunc(i64 %0) { ret i8 %7 } +define i50 @PR39793_bswap_u50_as_u16(i50 %0) { +; CHECK-LABEL: @PR39793_bswap_u50_as_u16( +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i50 [[TMP0:%.*]] to i16 +; CHECK-NEXT: [[REV:%.*]] = call i16 @llvm.bswap.i16(i16 [[TRUNC]]) +; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[REV]] to i50 +; CHECK-NEXT: ret i50 [[TMP2]] +; + %2 = lshr i50 %0, 8 + %3 = and i50 %2, 255 + %4 = shl i50 %0, 8 + %5 = and i50 %4, 65280 + %6 = or i50 %3, %5 + ret i50 %6 +} + define i32 @PR39793_bswap_u32_as_u16(i32 %0) { ; CHECK-LABEL: @PR39793_bswap_u32_as_u16( -; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0:%.*]], 8 -; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 255 -; CHECK-NEXT: [[TMP4:%.*]] = shl i32 [[TMP0]], 8 -; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 65280 -; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP3]], [[TMP5]] -; CHECK-NEXT: ret i32 [[TMP6]] +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[TMP0:%.*]] to i16 +; CHECK-NEXT: [[REV:%.*]] = call i16 @llvm.bswap.i16(i16 [[TRUNC]]) +; CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[REV]] to i32 +; CHECK-NEXT: ret i32 [[TMP2]] ; %2 = lshr i32 %0, 8 %3 = and i32 %2, 255 @@ -332,9 +332,9 @@ define i32 @PR39793_bswap_u32_as_u16(i32 %0) { define i8 @PR39793_bswap_u32_as_u16_trunc(i32 %0) { ; CHECK-LABEL: @PR39793_bswap_u32_as_u16_trunc( -; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0:%.*]], 8 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8 -; CHECK-NEXT: ret i8 [[TMP3]] +; CHECK-NEXT: [[REV1:%.*]] = lshr i32 [[TMP0:%.*]], 8 +; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[REV1]] to i8 +; CHECK-NEXT: ret i8 [[TMP2]] ; %2 = lshr i32 %0, 8 %3 = and i32 %2, 255 From 14088a6f5d1ae597960833a366beb9acee8d65cb Mon Sep 17 00:00:00 2001 From: Jakub Lichman Date: Wed, 30 Sep 2020 07:42:43 +0000 Subject: [PATCH 5/8] 
[mlir] Added support for rank reducing subviews This commit adds support for subviews that reduce the rank of their result by dropping static dimensions of size 1. Differential Revision: https://reviews.llvm.org/D88534 --- .../mlir/Dialect/StandardOps/IR/Ops.td | 14 +++ mlir/lib/Dialect/StandardOps/IR/Ops.cpp | 99 ++++++++++++++++++- mlir/lib/Dialect/Vector/VectorTransforms.cpp | 3 - mlir/test/IR/core-ops.mlir | 29 ++++++ mlir/test/IR/invalid-ops.mlir | 10 ++ 5 files changed, 147 insertions(+), 8 deletions(-) diff --git a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td index 352b7d8fd3d69c..ff1a82c2656140 100644 --- a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td +++ b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td @@ -2841,6 +2841,20 @@ def SubViewOp : Std_Op<"subview", [ "ArrayRef attrs = {}">, // Build a SubViewOp with all dynamic entries. OpBuilder< + "OpBuilder &b, OperationState &result, Value source, " + "ValueRange offsets, ValueRange sizes, ValueRange strides, " + "ArrayRef attrs = {}">, + // Build a SubViewOp with mixed static and dynamic entries + // and custom result type. + OpBuilder< + "OpBuilder &b, OperationState &result, MemRefType resultType, " + "Value source, ArrayRef staticOffsets, " + "ArrayRef staticSizes, ArrayRef staticStrides, " + "ValueRange offsets, ValueRange sizes, " + "ValueRange strides, ArrayRef attrs = {}">, + // Build a SubViewOp with all dynamic entries and custom result type. 
+ OpBuilder< + "OpBuilder &b, OperationState &result, MemRefType resultType, " "Value source, ValueRange offsets, ValueRange sizes, ValueRange strides, " "ArrayRef attrs = {}"> ]; diff --git a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp index c0dc87210a3f1c..1cabf172b7fcc3 100644 --- a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp +++ b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp @@ -2728,15 +2728,47 @@ void mlir::SubViewOp::build(OpBuilder &b, OperationState &result, Value source, staticStridesVector, offsets, sizes, strides, attrs); } +/// Build a SubViewOp as above but with custom result type. +void mlir::SubViewOp::build(OpBuilder &b, OperationState &result, + MemRefType resultType, Value source, + ArrayRef staticOffsets, + ArrayRef staticSizes, + ArrayRef staticStrides, ValueRange offsets, + ValueRange sizes, ValueRange strides, + ArrayRef attrs) { + build(b, result, resultType, source, offsets, sizes, strides, + b.getI64ArrayAttr(staticOffsets), b.getI64ArrayAttr(staticSizes), + b.getI64ArrayAttr(staticStrides)); + result.addAttributes(attrs); +} + +/// Build a SubViewOp as above but with custom result type. +void mlir::SubViewOp::build(OpBuilder &b, OperationState &result, + MemRefType resultType, Value source, + ValueRange offsets, ValueRange sizes, + ValueRange strides, + ArrayRef attrs) { + auto sourceMemRefType = source.getType().cast(); + unsigned rank = sourceMemRefType.getRank(); + SmallVector staticOffsetsVector; + staticOffsetsVector.assign(rank, ShapedType::kDynamicStrideOrOffset); + SmallVector staticSizesVector; + staticSizesVector.assign(rank, ShapedType::kDynamicSize); + SmallVector staticStridesVector; + staticStridesVector.assign(rank, ShapedType::kDynamicStrideOrOffset); + build(b, result, resultType, source, staticOffsetsVector, staticSizesVector, + staticStridesVector, offsets, sizes, strides, attrs); +} + /// Verify that a particular offset/size/stride static attribute is well-formed. 
static LogicalResult verifySubViewOpPart(SubViewOp op, StringRef name, StringRef attrName, ArrayAttr attr, llvm::function_ref isDynamic, ValueRange values) { /// Check static and dynamic offsets/sizes/strides breakdown. - if (attr.size() != op.getRank()) - return op.emitError("expected ") - << op.getRank() << " " << name << " values"; + size_t inputRank = op.source().getType().cast().getRank(); + if (attr.size() != inputRank) + return op.emitError("expected ") << inputRank << " " << name << " values"; unsigned expectedNumDynamicEntries = llvm::count_if(attr.getValue(), [&](Attribute attr) { return isDynamic(attr.cast().getInt()); @@ -2755,6 +2787,62 @@ static SmallVector extractFromI64ArrayAttr(Attribute attr) { })); } +/// Checks if the `original` MemRef type can be rank-reduced to the `reduced` type. +/// This function is a slight variant of the `is subsequence` algorithm in which +/// every non-matching dimension must be 1. +static bool isRankReducedType(Type originalType, Type reducedType) { + if (originalType == reducedType) + return true; + + MemRefType original = originalType.cast(); + MemRefType reduced = reducedType.cast(); + ArrayRef originalShape = original.getShape(); + ArrayRef reducedShape = reduced.getShape(); + unsigned originalRank = originalShape.size(), + reducedRank = reducedShape.size(); + if (reducedRank > originalRank) + return false; + + unsigned reducedIdx = 0; + SmallVector keepMask(originalRank); + for (unsigned originalIdx = 0; originalIdx < originalRank; ++originalIdx) { + // -2 is never used as a dim size so it will never match. + int reducedVal = reducedIdx < reducedRank ? reducedShape[reducedIdx] : -2; + // Skip matching dims greedily. + if ((keepMask[originalIdx] = originalShape[originalIdx] == reducedVal)) + reducedIdx++; + // 1 is the only non-matching allowed. + else if (originalShape[originalIdx] != 1) + return false; + } + // Must match the reduced rank. 
+ if (reducedIdx != reducedRank) + return false; + + MLIRContext *c = original.getContext(); + int64_t originalOffset, symCounter = 0, dimCounter = 0; + SmallVector originalStrides; + getStridesAndOffset(original, originalStrides, originalOffset); + auto getSymbolOrConstant = [&](int64_t offset) { + return offset == ShapedType::kDynamicStrideOrOffset + ? getAffineSymbolExpr(symCounter++, c) + : getAffineConstantExpr(offset, c); + }; + + AffineExpr expr = getSymbolOrConstant(originalOffset); + for (unsigned i = 0, e = originalStrides.size(); i < e; i++) { + if (keepMask[i]) + expr = expr + getSymbolOrConstant(originalStrides[i]) * + getAffineDimExpr(dimCounter++, c); + } + + auto reducedMap = AffineMap::get(dimCounter, symCounter, expr, c); + return original.getElementType() == reduced.getElementType() && + original.getMemorySpace() == reduced.getMemorySpace() && + (reduced.getAffineMaps().empty() || + reducedMap == reduced.getAffineMaps().front()); +} + /// Verifier for SubViewOp. static LogicalResult verify(SubViewOp op) { auto baseType = op.getBaseMemRefType().cast(); @@ -2790,8 +2878,9 @@ static LogicalResult verify(SubViewOp op) { op.getBaseMemRefType(), extractFromI64ArrayAttr(op.static_offsets()), extractFromI64ArrayAttr(op.static_sizes()), extractFromI64ArrayAttr(op.static_strides())); - if (op.getType() != expectedType) - return op.emitError("expected result type to be ") << expectedType; + if (!isRankReducedType(expectedType, subViewType)) + return op.emitError("expected result type to be ") + << expectedType << " or a rank-reduced version."; return success(); } diff --git a/mlir/lib/Dialect/Vector/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/VectorTransforms.cpp index 332bfbe2f4577e..5bf7857a66e8f6 100644 --- a/mlir/lib/Dialect/Vector/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/VectorTransforms.cpp @@ -2107,9 +2107,6 @@ LogicalResult mlir::vector::splitFullAndPartialTransferPrecondition( // TODO: expand support to these 2 cases. 
if (!xferOp.permutation_map().isMinorIdentity()) return failure(); - // TODO: relax this precondition. This will require rank-reducing subviews. - if (xferOp.getMemRefType().getRank() != xferOp.getTransferRank()) - return failure(); // Must have some masked dimension to be a candidate for splitting. if (!xferOp.hasMaskedDim()) return failure(); diff --git a/mlir/test/IR/core-ops.mlir b/mlir/test/IR/core-ops.mlir index f182936c870323..5e3959af29ddcb 100644 --- a/mlir/test/IR/core-ops.mlir +++ b/mlir/test/IR/core-ops.mlir @@ -19,6 +19,8 @@ // CHECK-DAG: #[[$SUBVIEW_MAP3:map[0-9]+]] = affine_map<(d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2 + 8)> // CHECK-DAG: #[[$SUBVIEW_MAP4:map[0-9]+]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)> // CHECK-DAG: #[[$SUBVIEW_MAP5:map[0-9]+]] = affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1 * 2)> +// CHECK-DAG: #[[$SUBVIEW_MAP6:map[0-9]+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0 * 36 + d1 * 36 + d2 * 4 + d3 * 4 + d4)> +// CHECK-DAG: #[[$SUBVIEW_MAP7:map[0-9]+]] = affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5, s6] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4 + d4 * s5 + d5 * s6)> // CHECK-LABEL: func @func_with_ops // CHECK-SAME: %[[ARG:.*]]: f32 @@ -797,6 +799,33 @@ func @memref_subview(%arg0 : index, %arg1 : index, %arg2 : index) { %11 = subview %9[%arg1, %arg2][4, 4][2, 2] : memref<16x4xf32> to memref<4x4xf32, offset: ?, strides:[8, 2]> + %12 = alloc() : memref<1x9x1x4x1xf32, affine_map<(d0, d1, d2, d3, d4) -> (36 * d0 + 36 * d1 + 4 * d2 + 4 * d3 + d4)>> + // CHECK: subview %12[%arg1, %arg1, %arg1, %arg1, %arg1] + // CHECK-SAME: [1, 9, 1, 4, 1] [%arg2, %arg2, %arg2, %arg2, %arg2] : + // CHECK-SAME: memref<1x9x1x4x1xf32, #[[$SUBVIEW_MAP6]]> to memref<9x4xf32, #[[$SUBVIEW_MAP2]]> + %13 = subview %12[%arg1, %arg1, %arg1, %arg1, %arg1][1, 9, 1, 4, 1][%arg2, %arg2, %arg2, %arg2, %arg2] : memref<1x9x1x4x1xf32, offset: 0, strides: [36, 36, 4, 4, 1]> to memref<9x4xf32, offset: ?, strides: [?, ?]> + // CHECK: subview 
%12[%arg1, %arg1, %arg1, %arg1, %arg1] + // CHECK-SAME: [1, 9, 1, 4, 1] [%arg2, %arg2, %arg2, %arg2, %arg2] : + // CHECK-SAME: memref<1x9x1x4x1xf32, #[[$SUBVIEW_MAP6]]> to memref<1x9x4xf32, #[[$BASE_MAP3]]> + %14 = subview %12[%arg1, %arg1, %arg1, %arg1, %arg1][1, 9, 1, 4, 1][%arg2, %arg2, %arg2, %arg2, %arg2] : memref<1x9x1x4x1xf32, offset: 0, strides: [36, 36, 4, 4, 1]> to memref<1x9x4xf32, offset: ?, strides: [?, ?, ?]> + + %15 = alloc(%arg1, %arg2)[%c0, %c1, %arg1, %arg0, %arg0, %arg2, %arg2] : memref<1x?x5x1x?x1xf32, affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5, s6] -> (s0 + s1 * d0 + s2 * d1 + s3 * d2 + s4 * d3 + s5 * d4 + s6 * d5)>> + // CHECK: subview %15[0, 0, 0, 0, 0, 0] [1, %arg1, 5, 1, %arg2, 1] [1, 1, 1, 1, 1, 1] : + // CHECK-SAME: memref<1x?x5x1x?x1xf32, #[[$SUBVIEW_MAP7]]> to memref + %16 = subview %15[0, 0, 0, 0, 0, 0][1, %arg1, 5, 1, %arg2, 1][1, 1, 1, 1, 1, 1] : memref<1x?x5x1x?x1xf32, offset: ?, strides: [?, ?, ?, ?, ?, ?]> to memref + // CHECK: subview %15[%arg1, %arg1, %arg1, %arg1, %arg1, %arg1] [1, %arg1, 5, 1, %arg2, 1] [1, 1, 1, 1, 1, 1] : + // CHECK-SAME: memref<1x?x5x1x?x1xf32, #[[$SUBVIEW_MAP7]]> to memref + %17 = subview %15[%arg1, %arg1, %arg1, %arg1, %arg1, %arg1][1, %arg1, 5, 1, %arg2, 1][1, 1, 1, 1, 1, 1] : memref<1x?x5x1x?x1xf32, offset: ?, strides: [?, ?, ?, ?, ?, ?]> to memref + + %18 = alloc() : memref<1x8xf32> + // CHECK: subview %18[0, 0] [1, 8] [1, 1] : memref<1x8xf32> to memref<8xf32> + %19 = subview %18[0, 0][1, 8][1, 1] : memref<1x8xf32> to memref<8xf32> + + %20 = alloc() : memref<8x16x4xf32> + // CHECK: subview %20[0, 0, 0] [1, 16, 4] [1, 1, 1] : memref<8x16x4xf32> to memref<16x4xf32> + %21 = subview %20[0, 0, 0][1, 16, 4][1, 1, 1] : memref<8x16x4xf32> to memref<16x4xf32> + + %22 = subview %20[3, 4, 2][1, 6, 3][1, 1, 1] : memref<8x16x4xf32> to memref<6x3xf32, offset: 210, strides: [4, 1]> return } diff --git a/mlir/test/IR/invalid-ops.mlir b/mlir/test/IR/invalid-ops.mlir index e02dbca494df61..ab18845bdb5324 
100644 --- a/mlir/test/IR/invalid-ops.mlir +++ b/mlir/test/IR/invalid-ops.mlir @@ -1020,6 +1020,16 @@ func @invalid_subview(%arg0 : index, %arg1 : index, %arg2 : index) { // ----- +func @invalid_rank_reducing_subview(%arg0 : index, %arg1 : index, %arg2 : index) { + %0 = alloc() : memref<8x16x4xf32> + // expected-error@+1 {{expected result type to be 'memref<8x16x4xf32, affine_map<(d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2)>>'}} + %1 = subview %0[0, 0, 0][8, 16, 4][1, 1, 1] + : memref<8x16x4xf32> to memref<16x4xf32> + return +} + +// ----- + func @invalid_memref_cast(%arg0 : memref<12x4x16xf32, offset:0, strides:[64, 16, 1]>) { // expected-error@+1{{operand type 'memref<12x4x16xf32, affine_map<(d0, d1, d2) -> (d0 * 64 + d1 * 16 + d2)>>' and result type 'memref<12x4x16xf32, affine_map<(d0, d1, d2) -> (d0 * 128 + d1 * 32 + d2 * 2)>>' are cast incompatible}} %0 = memref_cast %arg0 : memref<12x4x16xf32, offset:0, strides:[64, 16, 1]> to memref<12x4x16xf32, offset:0, strides:[128, 32, 2]> From 3cbd01ddb9372b725dcea3dd5fed21ef5b3d9578 Mon Sep 17 00:00:00 2001 From: Sam Parker Date: Wed, 30 Sep 2020 12:14:39 +0100 Subject: [PATCH 6/8] [NFC][ARM] Add more LowOverheadLoop tests. 
--- .../biquad-cascade-default.mir | 396 ++++++++++++++++++ .../biquad-cascade-optsize-strd-lr.mir | 392 +++++++++++++++++ .../biquad-cascade-optsize.mir | 396 ++++++++++++++++++ 3 files changed, 1184 insertions(+) create mode 100644 llvm/test/CodeGen/Thumb2/LowOverheadLoops/biquad-cascade-default.mir create mode 100644 llvm/test/CodeGen/Thumb2/LowOverheadLoops/biquad-cascade-optsize-strd-lr.mir create mode 100644 llvm/test/CodeGen/Thumb2/LowOverheadLoops/biquad-cascade-optsize.mir diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/biquad-cascade-default.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/biquad-cascade-default.mir new file mode 100644 index 00000000000000..3c37c4a14b7179 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/biquad-cascade-default.mir @@ -0,0 +1,396 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s + +--- | + %struct.arm_biquad_casd_df1_inst_q31 = type { i32*, i32*, i32, i32 } + + define hidden void @arm_biquad_cascade_df1_q31(%struct.arm_biquad_casd_df1_inst_q31* nocapture readonly %arg, i32* nocapture readonly %arg1, i32* nocapture %arg2, i32 %arg3) { + bb: + %i = bitcast %struct.arm_biquad_casd_df1_inst_q31* %arg to i32** + %i4 = load i32*, i32** %i, align 4 + %i5 = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_q31, %struct.arm_biquad_casd_df1_inst_q31* %arg, i32 0, i32 1 + %i6 = load i32*, i32** %i5, align 4 + %i7 = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_q31, %struct.arm_biquad_casd_df1_inst_q31* %arg, i32 0, i32 2 + %i8 = load i32, i32* %i7, align 4 + %i9 = sub i32 31, %i8 + %i10 = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_q31, %struct.arm_biquad_casd_df1_inst_q31* %arg, i32 0, i32 3 + %i11 = load i32, i32* %i10, align 4 + br label %bb12 + + bb12: ; preds = %bb74, %bb + %i13 = phi i32* [ %i6, %bb ], [ %i18, %bb74 ] + %i14 = phi i32* [ %i4, %bb ], [ %i85, 
%bb74 ] + %i15 = phi i32* [ %arg1, %bb ], [ %arg2, %bb74 ] + %i16 = phi i32 [ %i11, %bb ], [ %i89, %bb74 ] + %i18 = getelementptr inbounds i32, i32* %i13, i32 5 + %i19 = load i32, i32* %i14, align 4 + %i20 = getelementptr inbounds i32, i32* %i14, i32 1 + %i21 = load i32, i32* %i20, align 4 + %i22 = getelementptr inbounds i32, i32* %i14, i32 2 + %i23 = load i32, i32* %i22, align 4 + %i24 = getelementptr inbounds i32, i32* %i14, i32 3 + %i25 = load i32, i32* %i24, align 4 + %i26 = call i1 @llvm.test.set.loop.iterations.i32(i32 %arg3) + br i1 %i26, label %bb27, label %bb74 + + bb27: ; preds = %bb12 + %i28 = getelementptr inbounds i32, i32* %i13, i32 4 + %i29 = load i32, i32* %i28, align 4 + %i30 = getelementptr inbounds i32, i32* %i13, i32 3 + %i31 = load i32, i32* %i30, align 4 + %i32 = getelementptr inbounds i32, i32* %i13, i32 2 + %i33 = load i32, i32* %i32, align 4 + %i34 = getelementptr inbounds i32, i32* %i13, i32 1 + %i35 = load i32, i32* %i34, align 4 + %i36 = load i32, i32* %i13, align 4 + br label %bb37 + + bb37: ; preds = %bb37, %bb27 + %lsr.iv = phi i32 [ %lsr.iv.next, %bb37 ], [ %arg3, %bb27 ] + %i38 = phi i32* [ %i15, %bb27 ], [ %i51, %bb37 ] + %i39 = phi i32* [ %arg2, %bb27 ], [ %i69, %bb37 ] + %i40 = phi i32 [ %i25, %bb27 ], [ %i41, %bb37 ] + %i41 = phi i32 [ %i23, %bb27 ], [ %i68, %bb37 ] + %i42 = phi i32 [ %i21, %bb27 ], [ %i43, %bb37 ] + %i43 = phi i32 [ %i19, %bb27 ], [ %i52, %bb37 ] + %i45 = sext i32 %i29 to i64 + %i46 = sext i32 %i31 to i64 + %i47 = sext i32 %i33 to i64 + %i48 = sext i32 %i35 to i64 + %i49 = sext i32 %i36 to i64 + %i50 = zext i32 %i9 to i64 + %i51 = getelementptr inbounds i32, i32* %i38, i32 1 + %i52 = load i32, i32* %i38, align 4 + %i53 = sext i32 %i52 to i64 + %i54 = mul nsw i64 %i53, %i49 + %i55 = sext i32 %i43 to i64 + %i56 = mul nsw i64 %i55, %i48 + %i57 = sext i32 %i42 to i64 + %i58 = mul nsw i64 %i57, %i47 + %i59 = sext i32 %i41 to i64 + %i60 = mul nsw i64 %i59, %i46 + %i61 = sext i32 %i40 to i64 + %i62 = mul nsw i64 %i61, 
%i45 + %i63 = add i64 %i58, %i56 + %i64 = add i64 %i63, %i60 + %i65 = add i64 %i64, %i62 + %i66 = add i64 %i65, %i54 + %i67 = ashr i64 %i66, %i50 + %i68 = trunc i64 %i67 to i32 + %i69 = getelementptr inbounds i32, i32* %i39, i32 1 + store i32 %i68, i32* %i39, align 4 + %i70 = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) + %i71 = icmp ne i32 %i70, 0 + %lsr.iv.next = add i32 %lsr.iv, -1 + br i1 %i71, label %bb37, label %bb72 + + bb72: ; preds = %bb37 + %i73 = trunc i64 %i67 to i32 + br label %bb74 + + bb74: ; preds = %bb72, %bb12 + %i75 = phi i32 [ %i19, %bb12 ], [ %i52, %bb72 ] + %i76 = phi i32 [ %i21, %bb12 ], [ %i43, %bb72 ] + %i77 = phi i32 [ %i23, %bb12 ], [ %i73, %bb72 ] + %i78 = phi i32 [ %i25, %bb12 ], [ %i41, %bb72 ] + store i32 %i75, i32* %i14, align 4 + %i79 = bitcast i32* %i14 to i8* + %i80 = getelementptr inbounds i8, i8* %i79, i32 4 + %i81 = bitcast i8* %i80 to i32* + store i32 %i76, i32* %i81, align 4 + %i82 = bitcast i32* %i14 to i8* + %i83 = getelementptr inbounds i8, i8* %i82, i32 8 + %i84 = bitcast i8* %i83 to i32* + store i32 %i77, i32* %i84, align 4 + %i85 = getelementptr inbounds i32, i32* %i14, i32 4 + %i86 = bitcast i32* %i14 to i8* + %i87 = getelementptr inbounds i8, i8* %i86, i32 12 + %i88 = bitcast i8* %i87 to i32* + store i32 %i78, i32* %i88, align 4 + %i89 = add i32 %i16, -1 + %i90 = icmp eq i32 %i89, 0 + br i1 %i90, label %bb91, label %bb12 + + bb91: ; preds = %bb74 + ret void + } + + declare i1 @llvm.test.set.loop.iterations.i32(i32) + declare i32 @llvm.loop.decrement.reg.i32(i32, i32) + +... 
+--- +name: arm_biquad_cascade_df1_q31 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + stackSize: 76 + offsetAdjustment: 0 + maxAlignment: 4 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -40, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -44, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: spill-slot, offset: -48, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 3, name: '', type: spill-slot, offset: -52, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 4, name: '', type: spill-slot, offset: -56, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 5, name: '', type: spill-slot, offset: -60, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 6, name: '', type: spill-slot, offset: -64, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', 
debug-info-location: '' } + - { id: 7, name: '', type: spill-slot, offset: -68, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 8, name: '', type: spill-slot, offset: -72, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 9, name: '', type: spill-slot, offset: -76, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 10, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 11, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r11', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 12, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r10', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 13, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r9', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 14, name: '', type: spill-slot, offset: -20, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r8', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 15, name: '', type: spill-slot, offset: -24, size: 4, alignment: 4, + 
stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 16, name: '', type: spill-slot, offset: -28, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r6', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 17, name: '', type: spill-slot, offset: -32, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 18, name: '', type: spill-slot, offset: -36, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: arm_biquad_cascade_df1_q31 + ; CHECK: bb.0.bb: + ; CHECK: successors: %bb.2(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $lr + ; CHECK: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $r11, killed $lr + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 36 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r11, -8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r10, -12 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r9, -16 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r8, -20 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -24 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r6, -28 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -32 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -36 + ; CHECK: $sp = frame-setup tSUBspi $sp, 10, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 76 + ; 
CHECK: $r6, $r5 = t2LDRDi8 $r0, 8, 14 /* CC::al */, $noreg :: (load 4 from %ir.i7), (load 4 from %ir.i10) + ; CHECK: $r8 = tMOVr killed $r3, 14 /* CC::al */, $noreg + ; CHECK: $r3, $r7 = t2LDRDi8 killed $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i), (load 4 from %ir.i5) + ; CHECK: renamable $r0 = t2RSBri killed renamable $r6, 31, 14 /* CC::al */, $noreg, $noreg + ; CHECK: t2STMIA $sp, 14 /* CC::al */, $noreg, killed $r0, $r2, $r8 :: (store 4 into %stack.9), (store 4 into %stack.8), (store 4 into %stack.7) + ; CHECK: $r12 = tMOVr killed $r2, 14 /* CC::al */, $noreg + ; CHECK: renamable $r2 = tLDRspi $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.9) + ; CHECK: tB %bb.2, 14 /* CC::al */, $noreg + ; CHECK: bb.1.bb74 (align 4): + ; CHECK: successors: %bb.6(0x04000000), %bb.2(0x7c000000) + ; CHECK: liveins: $r0, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12, $r2 + ; CHECK: renamable $r7, dead $cpsr = nuw tADDi8 killed renamable $r7, 20, 14 /* CC::al */, $noreg + ; CHECK: t2STRDi8 killed $r9, killed $r4, $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.i14), (store 4 into %ir.i81) + ; CHECK: t2STRDi8 killed $r6, killed $r0, $r3, 8, 14 /* CC::al */, $noreg :: (store 4 into %ir.i84), (store 4 into %ir.i88) + ; CHECK: renamable $r3, dead $cpsr = nuw tADDi8 killed renamable $r3, 16, 14 /* CC::al */, $noreg + ; CHECK: renamable $r5, $cpsr = tSUBi8 killed renamable $r5, 1, 14 /* CC::al */, $noreg + ; CHECK: $r1 = tMOVr $r12, 14 /* CC::al */, $noreg + ; CHECK: tBcc %bb.6, 0 /* CC::eq */, killed $cpsr + ; CHECK: bb.2.bb12: + ; CHECK: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; CHECK: liveins: $r1, $r2, $r3, $r5, $r7, $r8, $r12 + ; CHECK: $r9, $r4 = t2LDRDi8 $r3, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i14), (load 4 from %ir.i20) + ; CHECK: $r6, $r0 = t2LDRDi8 $r3, 8, 14 /* CC::al */, $noreg :: (load 4 from %ir.i22), (load 4 from %ir.i24) + ; CHECK: t2CMPri renamable $r8, 0, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: tBcc %bb.1, 0 /* 
CC::eq */, killed $cpsr + ; CHECK: tB %bb.3, 14 /* CC::al */, $noreg + ; CHECK: bb.3.bb27: + ; CHECK: successors: %bb.4(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12 + ; CHECK: t2STRDi8 killed $r3, killed $r5, $sp, 12, 14 /* CC::al */, $noreg :: (store 4 into %stack.6), (store 4 into %stack.5) + ; CHECK: renamable $r3 = tLDRi renamable $r7, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i13) + ; CHECK: tSTRspi killed renamable $r3, $sp, 9, 14 /* CC::al */, $noreg :: (store 4 into %stack.0) + ; CHECK: renamable $r3 = tLDRi renamable $r7, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.i34) + ; CHECK: tSTRspi killed renamable $r3, $sp, 8, 14 /* CC::al */, $noreg :: (store 4 into %stack.1) + ; CHECK: renamable $r3 = tLDRi renamable $r7, 2, 14 /* CC::al */, $noreg :: (load 4 from %ir.i32) + ; CHECK: tSTRspi killed renamable $r3, $sp, 7, 14 /* CC::al */, $noreg :: (store 4 into %stack.2) + ; CHECK: renamable $r3 = tLDRi renamable $r7, 3, 14 /* CC::al */, $noreg :: (load 4 from %ir.i30) + ; CHECK: t2STRDi8 $r7, killed $r3, $sp, 20, 14 /* CC::al */, $noreg :: (store 4 into %stack.4), (store 4 into %stack.3) + ; CHECK: renamable $r10 = t2LDRi12 killed renamable $r7, 16, 14 /* CC::al */, $noreg :: (load 4 from %ir.i28) + ; CHECK: bb.4.bb37 (align 4): + ; CHECK: successors: %bb.4(0x7c000000), %bb.5(0x04000000) + ; CHECK: liveins: $r0, $r1, $r2, $r4, $r6, $r8, $r9, $r10, $r12 + ; CHECK: $r7 = tMOVr killed $r6, 14 /* CC::al */, $noreg + ; CHECK: renamable $r6 = tLDRspi $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %stack.1) + ; CHECK: renamable $r3 = tLDRspi $sp, 7, 14 /* CC::al */, $noreg :: (load 4 from %stack.2) + ; CHECK: renamable $r6, renamable $r11 = t2SMULL $r9, killed renamable $r6, 14 /* CC::al */, $noreg + ; CHECK: renamable $r6, renamable $r11 = t2SMLAL killed renamable $r4, killed renamable $r3, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + ; CHECK: renamable $r3 = tLDRspi $sp, 6, 14 /* CC::al */, 
$noreg :: (load 4 from %stack.3) + ; CHECK: $r5 = tMOVr killed $r9, 14 /* CC::al */, $noreg + ; CHECK: renamable $r6, renamable $r11 = t2SMLAL renamable $r7, killed renamable $r3, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + ; CHECK: renamable $r9, renamable $r1 = t2LDR_POST killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (load 4 from %ir.i38) + ; CHECK: renamable $r6, renamable $r11 = t2SMLAL killed renamable $r0, renamable $r10, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + ; CHECK: renamable $r0 = tLDRspi $sp, 9, 14 /* CC::al */, $noreg :: (load 4 from %stack.0) + ; CHECK: $lr = tMOVr $r8, 14 /* CC::al */, $noreg + ; CHECK: renamable $r6, renamable $r11 = t2SMLAL renamable $r9, killed renamable $r0, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + ; CHECK: early-clobber renamable $r6, dead early-clobber renamable $r11 = MVE_ASRLr killed renamable $r6, killed renamable $r11, renamable $r2, 14 /* CC::al */, $noreg + ; CHECK: early-clobber renamable $r12 = t2STR_POST renamable $r6, killed renamable $r12, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.i39) + ; CHECK: dead $lr = t2SUBri killed renamable $lr, 1, 14 /* CC::al */, $noreg, def $cpsr + ; CHECK: renamable $r8 = t2SUBri killed renamable $r8, 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r0 = tMOVr $r7, 14 /* CC::al */, $noreg + ; CHECK: $r4 = tMOVr $r5, 14 /* CC::al */, $noreg + ; CHECK: tBcc %bb.4, 1 /* CC::ne */, killed $cpsr + ; CHECK: tB %bb.5, 14 /* CC::al */, $noreg + ; CHECK: bb.5.bb72: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $r2, $r5, $r6, $r7, $r9 + ; CHECK: $r0 = tMOVr killed $r7, 14 /* CC::al */, $noreg + ; CHECK: $r7 = tADDrSPi $sp, 3, 14 /* CC::al */, $noreg + ; CHECK: $r4 = tMOVr killed $r5, 14 /* CC::al */, $noreg + ; CHECK: $r12, $r8 = t2LDRDi8 $sp, 4, 14 /* CC::al */, $noreg :: (load 4 from %stack.8), (load 4 from %stack.7) + ; CHECK: tLDMIA killed $r7, 14 /* CC::al */, $noreg, def $r3, def 
$r5, def $r7 :: (load 4 from %stack.6), (load 4 from %stack.5), (load 4 from %stack.4) + ; CHECK: tB %bb.1, 14 /* CC::al */, $noreg + ; CHECK: bb.6.bb91: + ; CHECK: $sp = frame-destroy tADDspi $sp, 10, 14 /* CC::al */, $noreg + ; CHECK: $sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11, def $pc + bb.0.bb: + successors: %bb.2(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $lr + + $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $r11, killed $lr + frame-setup CFI_INSTRUCTION def_cfa_offset 36 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r11, -8 + frame-setup CFI_INSTRUCTION offset $r10, -12 + frame-setup CFI_INSTRUCTION offset $r9, -16 + frame-setup CFI_INSTRUCTION offset $r8, -20 + frame-setup CFI_INSTRUCTION offset $r7, -24 + frame-setup CFI_INSTRUCTION offset $r6, -28 + frame-setup CFI_INSTRUCTION offset $r5, -32 + frame-setup CFI_INSTRUCTION offset $r4, -36 + $sp = frame-setup tSUBspi $sp, 10, 14 /* CC::al */, $noreg + frame-setup CFI_INSTRUCTION def_cfa_offset 76 + $r6, $r5 = t2LDRDi8 $r0, 8, 14 /* CC::al */, $noreg :: (load 4 from %ir.i7), (load 4 from %ir.i10) + $r8 = tMOVr killed $r3, 14 /* CC::al */, $noreg + $r3, $r7 = t2LDRDi8 killed $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i), (load 4 from %ir.i5) + renamable $r0 = t2RSBri killed renamable $r6, 31, 14 /* CC::al */, $noreg, $noreg + t2STMIA $sp, 14 /* CC::al */, $noreg, killed $r0, $r2, $r8 :: (store 4 into %stack.9), (store 4 into %stack.8), (store 4 into %stack.7) + $r12 = tMOVr killed $r2, 14 /* CC::al */, $noreg + renamable $r2 = tLDRspi $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.9) + tB %bb.2, 14 /* CC::al */, $noreg + + bb.1.bb74 (align 4): + successors: %bb.6(0x04000000), %bb.2(0x7c000000) + liveins: $r0, $r3, $r4, 
$r5, $r6, $r7, $r8, $r9, $r12, $r2 + + renamable $r7, dead $cpsr = nuw tADDi8 killed renamable $r7, 20, 14 /* CC::al */, $noreg + t2STRDi8 killed $r9, killed $r4, $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.i14), (store 4 into %ir.i81) + t2STRDi8 killed $r6, killed $r0, $r3, 8, 14 /* CC::al */, $noreg :: (store 4 into %ir.i84), (store 4 into %ir.i88) + renamable $r3, dead $cpsr = nuw tADDi8 killed renamable $r3, 16, 14 /* CC::al */, $noreg + renamable $r5, $cpsr = tSUBi8 killed renamable $r5, 1, 14 /* CC::al */, $noreg + $r1 = tMOVr $r12, 14 /* CC::al */, $noreg + tBcc %bb.6, 0 /* CC::eq */, killed $cpsr + + bb.2.bb12: + successors: %bb.3(0x40000000), %bb.1(0x40000000) + liveins: $r1, $r3, $r5, $r7, $r8, $r12, $r2 + + $r9, $r4 = t2LDRDi8 $r3, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i14), (load 4 from %ir.i20) + $r6, $r0 = t2LDRDi8 $r3, 8, 14 /* CC::al */, $noreg :: (load 4 from %ir.i22), (load 4 from %ir.i24) + t2WhileLoopStart renamable $r8, %bb.1, implicit-def dead $cpsr + tB %bb.3, 14 /* CC::al */, $noreg + + bb.3.bb27: + successors: %bb.4(0x80000000) + liveins: $r0, $r1, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12, $r2 + + t2STRDi8 killed $r3, killed $r5, $sp, 12, 14 /* CC::al */, $noreg :: (store 4 into %stack.6), (store 4 into %stack.5) + renamable $r3 = tLDRi renamable $r7, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i13) + tSTRspi killed renamable $r3, $sp, 9, 14 /* CC::al */, $noreg :: (store 4 into %stack.0) + renamable $r3 = tLDRi renamable $r7, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.i34) + tSTRspi killed renamable $r3, $sp, 8, 14 /* CC::al */, $noreg :: (store 4 into %stack.1) + renamable $r3 = tLDRi renamable $r7, 2, 14 /* CC::al */, $noreg :: (load 4 from %ir.i32) + tSTRspi killed renamable $r3, $sp, 7, 14 /* CC::al */, $noreg :: (store 4 into %stack.2) + renamable $r3 = tLDRi renamable $r7, 3, 14 /* CC::al */, $noreg :: (load 4 from %ir.i30) + t2STRDi8 $r7, killed $r3, $sp, 20, 14 /* CC::al */, $noreg :: (store 4 into 
%stack.4), (store 4 into %stack.3) + renamable $r10 = t2LDRi12 killed renamable $r7, 16, 14 /* CC::al */, $noreg :: (load 4 from %ir.i28) + + bb.4.bb37 (align 4): + successors: %bb.4(0x7c000000), %bb.5(0x04000000) + liveins: $r0, $r1, $r2, $r4, $r6, $r8, $r9, $r10, $r12 + + $r7 = tMOVr killed $r6, 14 /* CC::al */, $noreg + renamable $r6 = tLDRspi $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %stack.1) + renamable $r3 = tLDRspi $sp, 7, 14 /* CC::al */, $noreg :: (load 4 from %stack.2) + renamable $r6, renamable $r11 = t2SMULL $r9, killed renamable $r6, 14 /* CC::al */, $noreg + renamable $r6, renamable $r11 = t2SMLAL killed renamable $r4, killed renamable $r3, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + renamable $r3 = tLDRspi $sp, 6, 14 /* CC::al */, $noreg :: (load 4 from %stack.3) + $r5 = tMOVr killed $r9, 14 /* CC::al */, $noreg + renamable $r6, renamable $r11 = t2SMLAL renamable $r7, killed renamable $r3, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + renamable $r9, renamable $r1 = t2LDR_POST killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (load 4 from %ir.i38) + renamable $r6, renamable $r11 = t2SMLAL killed renamable $r0, renamable $r10, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + renamable $r0 = tLDRspi $sp, 9, 14 /* CC::al */, $noreg :: (load 4 from %stack.0) + $lr = tMOVr $r8, 14 /* CC::al */, $noreg + renamable $r6, renamable $r11 = t2SMLAL renamable $r9, killed renamable $r0, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + early-clobber renamable $r6, dead early-clobber renamable $r11 = MVE_ASRLr killed renamable $r6, killed renamable $r11, renamable $r2, 14 /* CC::al */, $noreg + early-clobber renamable $r12 = t2STR_POST renamable $r6, killed renamable $r12, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.i39) + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $r8 = t2SUBri killed renamable $r8, 1, 14 /* CC::al */, $noreg, $noreg + $r0 
= tMOVr $r7, 14 /* CC::al */, $noreg + $r4 = tMOVr $r5, 14 /* CC::al */, $noreg + t2LoopEnd killed renamable $lr, %bb.4, implicit-def dead $cpsr + tB %bb.5, 14 /* CC::al */, $noreg + + bb.5.bb72: + successors: %bb.1(0x80000000) + liveins: $r5, $r6, $r7, $r9, $r2 + + $r0 = tMOVr killed $r7, 14 /* CC::al */, $noreg + $r7 = tADDrSPi $sp, 3, 14 /* CC::al */, $noreg + $r4 = tMOVr killed $r5, 14 /* CC::al */, $noreg + $r12, $r8 = t2LDRDi8 $sp, 4, 14 /* CC::al */, $noreg :: (load 4 from %stack.8), (load 4 from %stack.7) + tLDMIA killed $r7, 14 /* CC::al */, $noreg, def $r3, def $r5, def $r7 :: (load 4 from %stack.6), (load 4 from %stack.5), (load 4 from %stack.4) + tB %bb.1, 14 /* CC::al */, $noreg + + bb.6.bb91: + $sp = frame-destroy tADDspi $sp, 10, 14 /* CC::al */, $noreg + $sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11, def $pc + +... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/biquad-cascade-optsize-strd-lr.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/biquad-cascade-optsize-strd-lr.mir new file mode 100644 index 00000000000000..a847b69c26143b --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/biquad-cascade-optsize-strd-lr.mir @@ -0,0 +1,392 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-low-overhead-loops %s -o - -verify-machineinstrs | FileCheck %s +--- | + %struct.arm_biquad_casd_df1_inst_q31 = type { i32*, i32*, i32, i32 } + + ; Function Attrs: optsize + define hidden void @arm_biquad_cascade_df1_q31(%struct.arm_biquad_casd_df1_inst_q31* nocapture readonly %arg, i32* nocapture readonly %arg1, i32* nocapture %arg2, i32 %arg3) #0 { + bb: + %i = bitcast %struct.arm_biquad_casd_df1_inst_q31* %arg to i32** + %i4 = load i32*, i32** %i, align 4 + %i5 = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_q31, %struct.arm_biquad_casd_df1_inst_q31* %arg, i32 0, i32 1 + %i6 
= load i32*, i32** %i5, align 4 + %i7 = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_q31, %struct.arm_biquad_casd_df1_inst_q31* %arg, i32 0, i32 2 + %i8 = load i32, i32* %i7, align 4 + %i9 = sub i32 31, %i8 + %i10 = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_q31, %struct.arm_biquad_casd_df1_inst_q31* %arg, i32 0, i32 3 + %i11 = load i32, i32* %i10, align 4 + br label %bb12 + + bb12: ; preds = %bb74, %bb + %i13 = phi i32* [ %i6, %bb ], [ %i18, %bb74 ] + %i14 = phi i32* [ %i4, %bb ], [ %i85, %bb74 ] + %i15 = phi i32* [ %arg1, %bb ], [ %arg2, %bb74 ] + %i16 = phi i32 [ %i11, %bb ], [ %i89, %bb74 ] + %i18 = getelementptr inbounds i32, i32* %i13, i32 5 + %i19 = load i32, i32* %i14, align 4 + %i20 = getelementptr inbounds i32, i32* %i14, i32 1 + %i21 = load i32, i32* %i20, align 4 + %i22 = getelementptr inbounds i32, i32* %i14, i32 2 + %i23 = load i32, i32* %i22, align 4 + %i24 = getelementptr inbounds i32, i32* %i14, i32 3 + %i25 = load i32, i32* %i24, align 4 + %i26 = call i1 @llvm.test.set.loop.iterations.i32(i32 %arg3) + br i1 %i26, label %bb27, label %bb74 + + bb27: ; preds = %bb12 + %i28 = getelementptr inbounds i32, i32* %i13, i32 4 + %i29 = load i32, i32* %i28, align 4 + %i30 = getelementptr inbounds i32, i32* %i13, i32 3 + %i31 = load i32, i32* %i30, align 4 + %i32 = getelementptr inbounds i32, i32* %i13, i32 2 + %i33 = load i32, i32* %i32, align 4 + %i34 = getelementptr inbounds i32, i32* %i13, i32 1 + %i35 = load i32, i32* %i34, align 4 + %i36 = load i32, i32* %i13, align 4 + br label %bb37 + + bb37: ; preds = %bb37, %bb27 + %lsr.iv = phi i32 [ %lsr.iv.next, %bb37 ], [ %arg3, %bb27 ] + %i38 = phi i32* [ %i15, %bb27 ], [ %i51, %bb37 ] + %i39 = phi i32* [ %arg2, %bb27 ], [ %i69, %bb37 ] + %i40 = phi i32 [ %i25, %bb27 ], [ %i41, %bb37 ] + %i41 = phi i32 [ %i23, %bb27 ], [ %i68, %bb37 ] + %i42 = phi i32 [ %i21, %bb27 ], [ %i43, %bb37 ] + %i43 = phi i32 [ %i19, %bb27 ], [ %i52, %bb37 ] + %i45 = sext i32 %i29 to i64 + %i46 = sext i32 %i31 to 
i64 + %i47 = sext i32 %i33 to i64 + %i48 = sext i32 %i35 to i64 + %i49 = sext i32 %i36 to i64 + %i50 = zext i32 %i9 to i64 + %i51 = getelementptr inbounds i32, i32* %i38, i32 1 + %i52 = load i32, i32* %i38, align 4 + %i53 = sext i32 %i52 to i64 + %i54 = mul nsw i64 %i53, %i49 + %i55 = sext i32 %i43 to i64 + %i56 = mul nsw i64 %i55, %i48 + %i57 = sext i32 %i42 to i64 + %i58 = mul nsw i64 %i57, %i47 + %i59 = sext i32 %i41 to i64 + %i60 = mul nsw i64 %i59, %i46 + %i61 = sext i32 %i40 to i64 + %i62 = mul nsw i64 %i61, %i45 + %i63 = add i64 %i58, %i56 + %i64 = add i64 %i63, %i60 + %i65 = add i64 %i64, %i62 + %i66 = add i64 %i65, %i54 + %i67 = ashr i64 %i66, %i50 + %i68 = trunc i64 %i67 to i32 + %i69 = getelementptr inbounds i32, i32* %i39, i32 1 + store i32 %i68, i32* %i39, align 4 + %i70 = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) + %i71 = icmp ne i32 %i70, 0 + %lsr.iv.next = add i32 %lsr.iv, -1 + br i1 %i71, label %bb37, label %bb72 + + bb72: ; preds = %bb37 + %i73 = trunc i64 %i67 to i32 + br label %bb74 + + bb74: ; preds = %bb72, %bb12 + %i75 = phi i32 [ %i19, %bb12 ], [ %i52, %bb72 ] + %i76 = phi i32 [ %i21, %bb12 ], [ %i43, %bb72 ] + %i77 = phi i32 [ %i23, %bb12 ], [ %i73, %bb72 ] + %i78 = phi i32 [ %i25, %bb12 ], [ %i41, %bb72 ] + store i32 %i75, i32* %i14, align 4 + %i79 = bitcast i32* %i14 to i8* + %i80 = getelementptr inbounds i8, i8* %i79, i32 4 + %i81 = bitcast i8* %i80 to i32* + store i32 %i76, i32* %i81, align 4 + %i82 = bitcast i32* %i14 to i8* + %i83 = getelementptr inbounds i8, i8* %i82, i32 8 + %i84 = bitcast i8* %i83 to i32* + store i32 %i77, i32* %i84, align 4 + %i85 = getelementptr inbounds i32, i32* %i14, i32 4 + %i86 = bitcast i32* %i14 to i8* + %i87 = getelementptr inbounds i8, i8* %i86, i32 12 + %i88 = bitcast i8* %i87 to i32* + store i32 %i78, i32* %i88, align 4 + %i89 = add i32 %i16, -1 + %i90 = icmp eq i32 %i89, 0 + br i1 %i90, label %bb91, label %bb12 + + bb91: ; preds = %bb74 + ret void + } + + declare i1 
@llvm.test.set.loop.iterations.i32(i32) #1 + declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #1 + + attributes #0 = { optsize "target-cpu"="cortex-m55" } + attributes #1 = { noduplicate nounwind "target-cpu"="cortex-m55" } + +... +--- +name: arm_biquad_cascade_df1_q31 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + stackSize: 76 + offsetAdjustment: 0 + maxAlignment: 4 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -40, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -44, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: spill-slot, offset: -48, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 3, name: '', type: spill-slot, offset: -52, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 4, name: '', type: spill-slot, offset: -56, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 5, name: '', type: spill-slot, offset: -60, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', 
debug-info-location: '' } + - { id: 6, name: '', type: spill-slot, offset: -64, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 7, name: '', type: spill-slot, offset: -68, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 8, name: '', type: spill-slot, offset: -72, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 9, name: '', type: spill-slot, offset: -76, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 10, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 11, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r11', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 12, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r10', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 13, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r9', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 14, name: '', type: spill-slot, offset: -20, size: 4, alignment: 4, + 
stack-id: default, callee-saved-register: '$r8', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 15, name: '', type: spill-slot, offset: -24, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 16, name: '', type: spill-slot, offset: -28, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r6', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 17, name: '', type: spill-slot, offset: -32, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 18, name: '', type: spill-slot, offset: -36, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: arm_biquad_cascade_df1_q31 + ; CHECK: bb.0.bb: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $lr + ; CHECK: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $r11, killed $lr + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 36 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r11, -8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r10, -12 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r9, -16 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r8, -20 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -24 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r6, -28 + ; 
CHECK: frame-setup CFI_INSTRUCTION offset $r5, -32 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -36 + ; CHECK: $sp = frame-setup tSUBspi $sp, 10, 14 /* CC::al */, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 76 + ; CHECK: $r7, $r5 = t2LDRDi8 $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i), (load 4 from %ir.i5) + ; CHECK: $r6, $r4 = t2LDRDi8 killed $r0, 8, 14 /* CC::al */, $noreg :: (load 4 from %ir.i7), (load 4 from %ir.i10) + ; CHECK: renamable $r0 = t2RSBri killed renamable $r6, 31, 14 /* CC::al */, $noreg, $noreg + ; CHECK: t2STMIA $sp, 14 /* CC::al */, $noreg, killed $r0, $r2, $r3 :: (store 4 into %stack.9), (store 4 into %stack.8), (store 4 into %stack.7) + ; CHECK: bb.1.bb12 (align 4): + ; CHECK: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; CHECK: liveins: $r1, $r2, $r3, $r4, $r5, $r7 + ; CHECK: $r9, $r8 = t2LDRDi8 $r7, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i14), (load 4 from %ir.i20) + ; CHECK: dead renamable $lr = nuw t2ADDri renamable $r5, 20, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r6, $r12 = t2LDRDi8 $r7, 8, 14 /* CC::al */, $noreg :: (load 4 from %ir.i22), (load 4 from %ir.i24) + ; CHECK: $lr = t2WLS renamable $r3, %bb.5 + ; CHECK: bb.2.bb27: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: liveins: $lr, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12 + ; CHECK: t2STRDi8 killed $lr, killed $r7, $sp, 12, 14 /* CC::al */, $noreg :: (store 4 into %stack.6), (store 4 into %stack.5) + ; CHECK: renamable $r0 = tLDRi renamable $r5, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i13) + ; CHECK: renamable $r10 = t2LDRi12 renamable $r5, 16, 14 /* CC::al */, $noreg :: (load 4 from %ir.i28) + ; CHECK: tSTRspi killed renamable $r0, $sp, 9, 14 /* CC::al */, $noreg :: (store 4 into %stack.0) + ; CHECK: renamable $r0 = tLDRi renamable $r5, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.i34) + ; CHECK: tSTRspi killed renamable $r4, $sp, 5, 14 /* CC::al */, $noreg :: (store 4 into %stack.4) + ; CHECK: tSTRspi killed 
renamable $r0, $sp, 8, 14 /* CC::al */, $noreg :: (store 4 into %stack.1) + ; CHECK: renamable $r0 = tLDRi renamable $r5, 2, 14 /* CC::al */, $noreg :: (load 4 from %ir.i32) + ; CHECK: tSTRspi killed renamable $r0, $sp, 7, 14 /* CC::al */, $noreg :: (store 4 into %stack.2) + ; CHECK: renamable $r0 = tLDRi killed renamable $r5, 3, 14 /* CC::al */, $noreg :: (load 4 from %ir.i30) + ; CHECK: tSTRspi killed renamable $r0, $sp, 6, 14 /* CC::al */, $noreg :: (store 4 into %stack.3) + ; CHECK: $r0 = tMOVr killed $r3, 14 /* CC::al */, $noreg + ; CHECK: renamable $r3 = tLDRspi $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.9) + ; CHECK: bb.3.bb37 (align 4): + ; CHECK: successors: %bb.3(0x7c000000), %bb.4(0x04000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r6, $r8, $r9, $r10, $r12 + ; CHECK: renamable $r4 = tLDRspi $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %stack.1) + ; CHECK: $r7 = tMOVr killed $r6, 14 /* CC::al */, $noreg + ; CHECK: $r5 = tMOVr $r9, 14 /* CC::al */, $noreg + ; CHECK: $lr = tMOVr $r0, 14 /* CC::al */, $noreg + ; CHECK: renamable $r6, renamable $r11 = t2SMULL killed $r9, killed renamable $r4, 14 /* CC::al */, $noreg + ; CHECK: renamable $r4 = tLDRspi $sp, 7, 14 /* CC::al */, $noreg :: (load 4 from %stack.2) + ; CHECK: renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $r9, renamable $r1 = t2LDR_POST killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (load 4 from %ir.i38) + ; CHECK: renamable $r6, renamable $r11 = t2SMLAL killed renamable $r8, killed renamable $r4, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + ; CHECK: renamable $r4 = tLDRspi $sp, 6, 14 /* CC::al */, $noreg :: (load 4 from %stack.3) + ; CHECK: $r8 = tMOVr $r5, 14 /* CC::al */, $noreg + ; CHECK: renamable $r6, renamable $r11 = t2SMLAL renamable $r7, killed renamable $r4, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + ; CHECK: renamable $r4 = tLDRspi $sp, 9, 14 /* CC::al */, 
$noreg :: (load 4 from %stack.0) + ; CHECK: renamable $r6, renamable $r11 = t2SMLAL killed renamable $r12, renamable $r10, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + ; CHECK: $r12 = tMOVr $r7, 14 /* CC::al */, $noreg + ; CHECK: renamable $r6, renamable $r11 = t2SMLAL renamable $r9, killed renamable $r4, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + ; CHECK: early-clobber renamable $r6, dead early-clobber renamable $r11 = MVE_ASRLr killed renamable $r6, killed renamable $r11, renamable $r3, 14 /* CC::al */, $noreg + ; CHECK: early-clobber renamable $r2 = t2STR_POST renamable $r6, killed renamable $r2, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.i39) + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.3 + ; CHECK: bb.4.bb72: + ; CHECK: successors: %bb.5(0x80000000) + ; CHECK: liveins: $r5, $r6, $r7, $r9 + ; CHECK: $r12 = tMOVr killed $r7, 14 /* CC::al */, $noreg + ; CHECK: $r7, $r4 = t2LDRDi8 $sp, 16, 14 /* CC::al */, $noreg :: (load 4 from %stack.5), (load 4 from %stack.4) + ; CHECK: $lr = t2ADDri $sp, 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r8 = tMOVr killed $r5, 14 /* CC::al */, $noreg + ; CHECK: t2LDMIA killed $lr, 14 /* CC::al */, $noreg, def $r2, def $r3, def $lr :: (load 4 from %stack.8), (load 4 from %stack.7), (load 4 from %stack.6) + ; CHECK: bb.5.bb74: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r2, $r3, $r4, $r6, $r7, $r8, $r9, $r12 + ; CHECK: t2STRDi8 killed $r9, killed $r8, $r7, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.i14), (store 4 into %ir.i81) + ; CHECK: t2STRDi8 killed $r6, killed $r12, $r7, 8, 14 /* CC::al */, $noreg :: (store 4 into %ir.i84), (store 4 into %ir.i88) + ; CHECK: renamable $r7, dead $cpsr = nuw tADDi8 killed renamable $r7, 16, 14 /* CC::al */, $noreg + ; CHECK: renamable $r4, $cpsr = tSUBi8 killed renamable $r4, 1, 14 /* CC::al */, $noreg + ; CHECK: $r5 = tMOVr killed $lr, 14 /* CC::al */, $noreg + ; CHECK: $r1 = tMOVr $r2, 14 /* 
CC::al */, $noreg + ; CHECK: t2IT 0, 4, implicit-def $itstate + ; CHECK: $sp = frame-destroy tADDspi $sp, 10, 0 /* CC::eq */, $cpsr, implicit $itstate + ; CHECK: $sp = frame-destroy t2LDMIA_RET $sp, 0 /* CC::eq */, killed $cpsr, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11, def $pc, implicit $sp, implicit killed $r4, implicit killed $r5, implicit killed $r7, implicit killed $itstate + ; CHECK: tB %bb.1, 14 /* CC::al */, $noreg + bb.0.bb: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $lr + + $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $r11, killed $lr + frame-setup CFI_INSTRUCTION def_cfa_offset 36 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r11, -8 + frame-setup CFI_INSTRUCTION offset $r10, -12 + frame-setup CFI_INSTRUCTION offset $r9, -16 + frame-setup CFI_INSTRUCTION offset $r8, -20 + frame-setup CFI_INSTRUCTION offset $r7, -24 + frame-setup CFI_INSTRUCTION offset $r6, -28 + frame-setup CFI_INSTRUCTION offset $r5, -32 + frame-setup CFI_INSTRUCTION offset $r4, -36 + $sp = frame-setup tSUBspi $sp, 10, 14 /* CC::al */, $noreg + frame-setup CFI_INSTRUCTION def_cfa_offset 76 + $r7, $r5 = t2LDRDi8 $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i), (load 4 from %ir.i5) + $r6, $r4 = t2LDRDi8 killed $r0, 8, 14 /* CC::al */, $noreg :: (load 4 from %ir.i7), (load 4 from %ir.i10) + renamable $r0 = t2RSBri killed renamable $r6, 31, 14 /* CC::al */, $noreg, $noreg + t2STMIA $sp, 14 /* CC::al */, $noreg, killed $r0, $r2, $r3 :: (store 4 into %stack.9), (store 4 into %stack.8), (store 4 into %stack.7) + + bb.1.bb12 (align 4): + successors: %bb.2(0x40000000), %bb.5(0x40000000) + liveins: $r1, $r2, $r3, $r4, $r5, $r7 + + $r9, $r8 = t2LDRDi8 $r7, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i14), (load 4 from %ir.i20) + renamable $lr = nuw 
t2ADDri renamable $r5, 20, 14 /* CC::al */, $noreg, $noreg + $r6, $r12 = t2LDRDi8 $r7, 8, 14 /* CC::al */, $noreg :: (load 4 from %ir.i22), (load 4 from %ir.i24) + t2WhileLoopStart renamable $r3, %bb.5, implicit-def dead $cpsr + tB %bb.2, 14 /* CC::al */, $noreg + + bb.2.bb27: + successors: %bb.3(0x80000000) + liveins: $lr, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12 + + t2STRDi8 killed $lr, killed $r7, $sp, 12, 14 /* CC::al */, $noreg :: (store 4 into %stack.6), (store 4 into %stack.5) + renamable $r0 = tLDRi renamable $r5, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i13) + renamable $r10 = t2LDRi12 renamable $r5, 16, 14 /* CC::al */, $noreg :: (load 4 from %ir.i28) + tSTRspi killed renamable $r0, $sp, 9, 14 /* CC::al */, $noreg :: (store 4 into %stack.0) + renamable $r0 = tLDRi renamable $r5, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.i34) + tSTRspi killed renamable $r4, $sp, 5, 14 /* CC::al */, $noreg :: (store 4 into %stack.4) + tSTRspi killed renamable $r0, $sp, 8, 14 /* CC::al */, $noreg :: (store 4 into %stack.1) + renamable $r0 = tLDRi renamable $r5, 2, 14 /* CC::al */, $noreg :: (load 4 from %ir.i32) + tSTRspi killed renamable $r0, $sp, 7, 14 /* CC::al */, $noreg :: (store 4 into %stack.2) + renamable $r0 = tLDRi killed renamable $r5, 3, 14 /* CC::al */, $noreg :: (load 4 from %ir.i30) + tSTRspi killed renamable $r0, $sp, 6, 14 /* CC::al */, $noreg :: (store 4 into %stack.3) + $r0 = tMOVr killed $r3, 14 /* CC::al */, $noreg + renamable $r3 = tLDRspi $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.9) + + bb.3.bb37 (align 4): + successors: %bb.3(0x7c000000), %bb.4(0x04000000) + liveins: $r0, $r1, $r2, $r3, $r6, $r8, $r9, $r10, $r12 + + renamable $r4 = tLDRspi $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %stack.1) + $r7 = tMOVr killed $r6, 14 /* CC::al */, $noreg + $r5 = tMOVr $r9, 14 /* CC::al */, $noreg + $lr = tMOVr $r0, 14 /* CC::al */, $noreg + renamable $r6, renamable $r11 = t2SMULL killed $r9, killed renamable $r4, 14 /* CC::al 
*/, $noreg + renamable $r4 = tLDRspi $sp, 7, 14 /* CC::al */, $noreg :: (load 4 from %stack.2) + renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 1, 14 /* CC::al */, $noreg + renamable $r9, renamable $r1 = t2LDR_POST killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (load 4 from %ir.i38) + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $r6, renamable $r11 = t2SMLAL killed renamable $r8, killed renamable $r4, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + renamable $r4 = tLDRspi $sp, 6, 14 /* CC::al */, $noreg :: (load 4 from %stack.3) + $r8 = tMOVr $r5, 14 /* CC::al */, $noreg + renamable $r6, renamable $r11 = t2SMLAL renamable $r7, killed renamable $r4, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + renamable $r4 = tLDRspi $sp, 9, 14 /* CC::al */, $noreg :: (load 4 from %stack.0) + renamable $r6, renamable $r11 = t2SMLAL killed renamable $r12, renamable $r10, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + $r12 = tMOVr $r7, 14 /* CC::al */, $noreg + renamable $r6, renamable $r11 = t2SMLAL renamable $r9, killed renamable $r4, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + early-clobber renamable $r6, dead early-clobber renamable $r11 = MVE_ASRLr killed renamable $r6, killed renamable $r11, renamable $r3, 14 /* CC::al */, $noreg + early-clobber renamable $r2 = t2STR_POST renamable $r6, killed renamable $r2, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.i39) + t2LoopEnd killed renamable $lr, %bb.3, implicit-def dead $cpsr + tB %bb.4, 14 /* CC::al */, $noreg + + bb.4.bb72: + successors: %bb.5(0x80000000) + liveins: $r5, $r6, $r7, $r9 + + $r12 = tMOVr killed $r7, 14 /* CC::al */, $noreg + $r7, $r4 = t2LDRDi8 $sp, 16, 14 /* CC::al */, $noreg :: (load 4 from %stack.5), (load 4 from %stack.4) + $lr = t2ADDri $sp, 4, 14 /* CC::al */, $noreg, $noreg + $r8 = tMOVr killed $r5, 14 /* CC::al */, $noreg + t2LDMIA killed $lr, 14 /* CC::al */, $noreg, def $r2, 
def $r3, def $lr :: (load 4 from %stack.8), (load 4 from %stack.7), (load 4 from %stack.6) + + bb.5.bb74: + successors: %bb.1(0x7c000000) + liveins: $lr, $r2, $r3, $r4, $r6, $r7, $r8, $r9, $r12 + + t2STRDi8 killed $r9, killed $r8, $r7, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.i14), (store 4 into %ir.i81) + t2STRDi8 killed $r6, killed $r12, $r7, 8, 14 /* CC::al */, $noreg :: (store 4 into %ir.i84), (store 4 into %ir.i88) + renamable $r7, dead $cpsr = nuw tADDi8 killed renamable $r7, 16, 14 /* CC::al */, $noreg + renamable $r4, $cpsr = tSUBi8 killed renamable $r4, 1, 14 /* CC::al */, $noreg + $r5 = tMOVr killed $lr, 14 /* CC::al */, $noreg + $r1 = tMOVr $r2, 14 /* CC::al */, $noreg + t2IT 0, 4, implicit-def $itstate + $sp = frame-destroy tADDspi $sp, 10, 0 /* CC::eq */, $cpsr, implicit $itstate + $sp = frame-destroy t2LDMIA_RET $sp, 0 /* CC::eq */, killed $cpsr, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11, def $pc, implicit $sp, implicit killed $r4, implicit killed $r5, implicit killed $r7, implicit killed $itstate + tB %bb.1, 14 /* CC::al */, $noreg + +... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/biquad-cascade-optsize.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/biquad-cascade-optsize.mir new file mode 100644 index 00000000000000..f9b625c8141e7e --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/biquad-cascade-optsize.mir @@ -0,0 +1,396 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s + +--- | + %struct.arm_biquad_casd_df1_inst_q31 = type { i32*, i32*, i32, i32 } + + ; Function Attrs: optsize + define hidden void @arm_biquad_cascade_df1_q31(%struct.arm_biquad_casd_df1_inst_q31* nocapture readonly %arg, i32* nocapture readonly %arg1, i32* nocapture %arg2, i32 %arg3) #0 { + bb: + %i = bitcast %struct.arm_biquad_casd_df1_inst_q31* %arg to i32** + %i4 = load i32*, i32** %i, align 4 + %i5 = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_q31, %struct.arm_biquad_casd_df1_inst_q31* %arg, i32 0, i32 1 + %i6 = load i32*, i32** %i5, align 4 + %i7 = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_q31, %struct.arm_biquad_casd_df1_inst_q31* %arg, i32 0, i32 2 + %i8 = load i32, i32* %i7, align 4 + %i9 = sub i32 31, %i8 + %i10 = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_q31, %struct.arm_biquad_casd_df1_inst_q31* %arg, i32 0, i32 3 + %i11 = load i32, i32* %i10, align 4 + br label %bb12 + + bb12: ; preds = %bb74, %bb + %i13 = phi i32* [ %i6, %bb ], [ %i18, %bb74 ] + %i14 = phi i32* [ %i4, %bb ], [ %i85, %bb74 ] + %i15 = phi i32* [ %arg1, %bb ], [ %arg2, %bb74 ] + %i16 = phi i32 [ %i11, %bb ], [ %i89, %bb74 ] + %i18 = getelementptr inbounds i32, i32* %i13, i32 5 + %i19 = load i32, i32* %i14, align 4 + %i20 = getelementptr inbounds i32, i32* %i14, i32 1 + %i21 = load i32, i32* %i20, align 4 + %i22 = getelementptr inbounds i32, i32* %i14, i32 2 + %i23 = load i32, i32* %i22, align 4 + %i24 = getelementptr inbounds i32, i32* %i14, i32 3 + %i25 = load i32, 
i32* %i24, align 4 + %i26 = call i1 @llvm.test.set.loop.iterations.i32(i32 %arg3) + br i1 %i26, label %bb27, label %bb74 + + bb27: ; preds = %bb12 + %i28 = getelementptr inbounds i32, i32* %i13, i32 4 + %i29 = load i32, i32* %i28, align 4 + %i30 = getelementptr inbounds i32, i32* %i13, i32 3 + %i31 = load i32, i32* %i30, align 4 + %i32 = getelementptr inbounds i32, i32* %i13, i32 2 + %i33 = load i32, i32* %i32, align 4 + %i34 = getelementptr inbounds i32, i32* %i13, i32 1 + %i35 = load i32, i32* %i34, align 4 + %i36 = load i32, i32* %i13, align 4 + br label %bb37 + + bb37: ; preds = %bb37, %bb27 + %lsr.iv = phi i32 [ %lsr.iv.next, %bb37 ], [ %arg3, %bb27 ] + %i38 = phi i32* [ %i15, %bb27 ], [ %i51, %bb37 ] + %i39 = phi i32* [ %arg2, %bb27 ], [ %i69, %bb37 ] + %i40 = phi i32 [ %i25, %bb27 ], [ %i41, %bb37 ] + %i41 = phi i32 [ %i23, %bb27 ], [ %i68, %bb37 ] + %i42 = phi i32 [ %i21, %bb27 ], [ %i43, %bb37 ] + %i43 = phi i32 [ %i19, %bb27 ], [ %i52, %bb37 ] + %i45 = sext i32 %i29 to i64 + %i46 = sext i32 %i31 to i64 + %i47 = sext i32 %i33 to i64 + %i48 = sext i32 %i35 to i64 + %i49 = sext i32 %i36 to i64 + %i50 = zext i32 %i9 to i64 + %i51 = getelementptr inbounds i32, i32* %i38, i32 1 + %i52 = load i32, i32* %i38, align 4 + %i53 = sext i32 %i52 to i64 + %i54 = mul nsw i64 %i53, %i49 + %i55 = sext i32 %i43 to i64 + %i56 = mul nsw i64 %i55, %i48 + %i57 = sext i32 %i42 to i64 + %i58 = mul nsw i64 %i57, %i47 + %i59 = sext i32 %i41 to i64 + %i60 = mul nsw i64 %i59, %i46 + %i61 = sext i32 %i40 to i64 + %i62 = mul nsw i64 %i61, %i45 + %i63 = add i64 %i58, %i56 + %i64 = add i64 %i63, %i60 + %i65 = add i64 %i64, %i62 + %i66 = add i64 %i65, %i54 + %i67 = ashr i64 %i66, %i50 + %i68 = trunc i64 %i67 to i32 + %i69 = getelementptr inbounds i32, i32* %i39, i32 1 + store i32 %i68, i32* %i39, align 4 + %i70 = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) + %i71 = icmp ne i32 %i70, 0 + %lsr.iv.next = add i32 %lsr.iv, -1 + br i1 %i71, label %bb37, label %bb72 + + bb72: ; 
preds = %bb37 + %i73 = trunc i64 %i67 to i32 + br label %bb74 + + bb74: ; preds = %bb72, %bb12 + %i75 = phi i32 [ %i19, %bb12 ], [ %i52, %bb72 ] + %i76 = phi i32 [ %i21, %bb12 ], [ %i43, %bb72 ] + %i77 = phi i32 [ %i23, %bb12 ], [ %i73, %bb72 ] + %i78 = phi i32 [ %i25, %bb12 ], [ %i41, %bb72 ] + store i32 %i75, i32* %i14, align 4 + %i79 = bitcast i32* %i14 to i8* + %i80 = getelementptr inbounds i8, i8* %i79, i32 4 + %i81 = bitcast i8* %i80 to i32* + store i32 %i76, i32* %i81, align 4 + %i82 = bitcast i32* %i14 to i8* + %i83 = getelementptr inbounds i8, i8* %i82, i32 8 + %i84 = bitcast i8* %i83 to i32* + store i32 %i77, i32* %i84, align 4 + %i85 = getelementptr inbounds i32, i32* %i14, i32 4 + %i86 = bitcast i32* %i14 to i8* + %i87 = getelementptr inbounds i8, i8* %i86, i32 12 + %i88 = bitcast i8* %i87 to i32* + store i32 %i78, i32* %i88, align 4 + %i89 = add i32 %i16, -1 + %i90 = icmp eq i32 %i89, 0 + br i1 %i90, label %bb91, label %bb12 + + bb91: ; preds = %bb74 + ret void + } + + ; Function Attrs: noduplicate nounwind + declare i1 @llvm.test.set.loop.iterations.i32(i32) #1 + + ; Function Attrs: noduplicate nounwind + declare i32 @llvm.loop.decrement.reg.i32(i32, i32) #1 + + attributes #0 = { optsize "target-cpu"="cortex-m55" } + attributes #1 = { noduplicate nounwind "target-cpu"="cortex-m55" } + +... 
+--- +name: arm_biquad_cascade_df1_q31 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + stackSize: 76 + offsetAdjustment: 0 + maxAlignment: 4 + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -40, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -44, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: spill-slot, offset: -48, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 3, name: '', type: spill-slot, offset: -52, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 4, name: '', type: spill-slot, offset: -56, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 5, name: '', type: spill-slot, offset: -60, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 6, name: '', type: spill-slot, offset: -64, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', 
debug-info-expression: '', debug-info-location: '' } + - { id: 7, name: '', type: spill-slot, offset: -68, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 8, name: '', type: spill-slot, offset: -72, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 9, name: '', type: spill-slot, offset: -76, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 10, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 11, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r11', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 12, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r10', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 13, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r9', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 14, name: '', type: spill-slot, offset: -20, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r8', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 15, name: '', type: spill-slot, offset: 
-24, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 16, name: '', type: spill-slot, offset: -28, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r6', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 17, name: '', type: spill-slot, offset: -32, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 18, name: '', type: spill-slot, offset: -36, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: arm_biquad_cascade_df1_q31 + ; CHECK: bb.0.bb: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $lr + ; CHECK: $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $r11, killed $lr + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 36 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r11, -8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r10, -12 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r9, -16 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r8, -20 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -24 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r6, -28 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -32 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -36 + ; CHECK: $sp = frame-setup tSUBspi $sp, 10, 14 /* CC::al */, $noreg + ; CHECK: frame-setup 
CFI_INSTRUCTION def_cfa_offset 76 + ; CHECK: $r6, $r5 = t2LDRDi8 $r0, 8, 14 /* CC::al */, $noreg :: (load 4 from %ir.i7), (load 4 from %ir.i10) + ; CHECK: $r8 = tMOVr killed $r3, 14 /* CC::al */, $noreg + ; CHECK: $r3, $r7 = t2LDRDi8 killed $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i), (load 4 from %ir.i5) + ; CHECK: renamable $r0 = t2RSBri killed renamable $r6, 31, 14 /* CC::al */, $noreg, $noreg + ; CHECK: t2STMIA $sp, 14 /* CC::al */, $noreg, killed $r0, $r2, $r8 :: (store 4 into %stack.9), (store 4 into %stack.8), (store 4 into %stack.7) + ; CHECK: $r12 = tMOVr killed $r2, 14 /* CC::al */, $noreg + ; CHECK: renamable $r2 = tLDRspi $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.9) + ; CHECK: bb.1.bb12 (align 4): + ; CHECK: successors: %bb.2(0x40000000), %bb.5(0x40000000) + ; CHECK: liveins: $r1, $r2, $r3, $r5, $r7, $r8, $r12 + ; CHECK: $r9, $r4 = t2LDRDi8 $r3, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i14), (load 4 from %ir.i20) + ; CHECK: $r6, $r0 = t2LDRDi8 $r3, 8, 14 /* CC::al */, $noreg :: (load 4 from %ir.i22), (load 4 from %ir.i24) + ; CHECK: dead $lr = t2WLS renamable $r8, %bb.5 + ; CHECK: bb.2.bb27: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12 + ; CHECK: t2STRDi8 killed $r3, killed $r5, $sp, 12, 14 /* CC::al */, $noreg :: (store 4 into %stack.6), (store 4 into %stack.5) + ; CHECK: renamable $r3 = tLDRi renamable $r7, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i13) + ; CHECK: tSTRspi killed renamable $r3, $sp, 9, 14 /* CC::al */, $noreg :: (store 4 into %stack.0) + ; CHECK: renamable $r3 = tLDRi renamable $r7, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.i34) + ; CHECK: tSTRspi killed renamable $r3, $sp, 8, 14 /* CC::al */, $noreg :: (store 4 into %stack.1) + ; CHECK: renamable $r3 = tLDRi renamable $r7, 2, 14 /* CC::al */, $noreg :: (load 4 from %ir.i32) + ; CHECK: tSTRspi killed renamable $r3, $sp, 7, 14 /* CC::al */, $noreg :: (store 4 into %stack.2) + ; 
CHECK: renamable $r3 = tLDRi renamable $r7, 3, 14 /* CC::al */, $noreg :: (load 4 from %ir.i30) + ; CHECK: t2STRDi8 $r7, killed $r3, $sp, 20, 14 /* CC::al */, $noreg :: (store 4 into %stack.4), (store 4 into %stack.3) + ; CHECK: renamable $r10 = t2LDRi12 killed renamable $r7, 16, 14 /* CC::al */, $noreg :: (load 4 from %ir.i28) + ; CHECK: bb.3.bb37 (align 4): + ; CHECK: successors: %bb.3(0x7c000000), %bb.4(0x04000000) + ; CHECK: liveins: $r0, $r1, $r2, $r4, $r6, $r8, $r9, $r10, $r12 + ; CHECK: $r7 = tMOVr killed $r6, 14 /* CC::al */, $noreg + ; CHECK: renamable $r6 = tLDRspi $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %stack.1) + ; CHECK: renamable $r3 = tLDRspi $sp, 7, 14 /* CC::al */, $noreg :: (load 4 from %stack.2) + ; CHECK: renamable $r6, renamable $r11 = t2SMULL $r9, killed renamable $r6, 14 /* CC::al */, $noreg + ; CHECK: renamable $r6, renamable $r11 = t2SMLAL killed renamable $r4, killed renamable $r3, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + ; CHECK: renamable $r3 = tLDRspi $sp, 6, 14 /* CC::al */, $noreg :: (load 4 from %stack.3) + ; CHECK: $r5 = tMOVr killed $r9, 14 /* CC::al */, $noreg + ; CHECK: renamable $r6, renamable $r11 = t2SMLAL renamable $r7, killed renamable $r3, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + ; CHECK: renamable $r9, renamable $r1 = t2LDR_POST killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (load 4 from %ir.i38) + ; CHECK: renamable $r6, renamable $r11 = t2SMLAL killed renamable $r0, renamable $r10, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + ; CHECK: renamable $r0 = tLDRspi $sp, 9, 14 /* CC::al */, $noreg :: (load 4 from %stack.0) + ; CHECK: $lr = tMOVr $r8, 14 /* CC::al */, $noreg + ; CHECK: renamable $r6, renamable $r11 = t2SMLAL renamable $r9, killed renamable $r0, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + ; CHECK: early-clobber renamable $r6, dead early-clobber renamable $r11 = MVE_ASRLr killed renamable 
$r6, killed renamable $r11, renamable $r2, 14 /* CC::al */, $noreg + ; CHECK: early-clobber renamable $r12 = t2STR_POST renamable $r6, killed renamable $r12, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.i39) + ; CHECK: renamable $r8 = t2SUBri killed renamable $r8, 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $r0 = tMOVr $r7, 14 /* CC::al */, $noreg + ; CHECK: $r4 = tMOVr $r5, 14 /* CC::al */, $noreg + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.3 + ; CHECK: bb.4.bb72: + ; CHECK: successors: %bb.5(0x80000000) + ; CHECK: liveins: $r2, $r5, $r6, $r7, $r9 + ; CHECK: $r0 = tMOVr killed $r7, 14 /* CC::al */, $noreg + ; CHECK: $r7 = tADDrSPi $sp, 3, 14 /* CC::al */, $noreg + ; CHECK: $r4 = tMOVr killed $r5, 14 /* CC::al */, $noreg + ; CHECK: $r12, $r8 = t2LDRDi8 $sp, 4, 14 /* CC::al */, $noreg :: (load 4 from %stack.8), (load 4 from %stack.7) + ; CHECK: tLDMIA killed $r7, 14 /* CC::al */, $noreg, def $r3, def $r5, def $r7 :: (load 4 from %stack.6), (load 4 from %stack.5), (load 4 from %stack.4) + ; CHECK: bb.5.bb74: + ; CHECK: successors: %bb.6(0x04000000), %bb.1(0x7c000000) + ; CHECK: liveins: $r0, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12, $r2 + ; CHECK: renamable $r7, dead $cpsr = nuw tADDi8 killed renamable $r7, 20, 14 /* CC::al */, $noreg + ; CHECK: t2STRDi8 killed $r9, killed $r4, $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.i14), (store 4 into %ir.i81) + ; CHECK: t2STRDi8 killed $r6, killed $r0, $r3, 8, 14 /* CC::al */, $noreg :: (store 4 into %ir.i84), (store 4 into %ir.i88) + ; CHECK: renamable $r3, dead $cpsr = nuw tADDi8 killed renamable $r3, 16, 14 /* CC::al */, $noreg + ; CHECK: renamable $r5, $cpsr = tSUBi8 killed renamable $r5, 1, 14 /* CC::al */, $noreg + ; CHECK: $r1 = tMOVr $r12, 14 /* CC::al */, $noreg + ; CHECK: tBcc %bb.1, 1 /* CC::ne */, killed $cpsr + ; CHECK: bb.6.bb91: + ; CHECK: $sp = frame-destroy tADDspi $sp, 10, 14 /* CC::al */, $noreg + ; CHECK: $sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r4, def 
$r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11, def $pc + bb.0.bb: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $lr + + $sp = frame-setup t2STMDB_UPD $sp, 14 /* CC::al */, $noreg, killed $r4, killed $r5, killed $r6, killed $r7, killed $r8, killed $r9, killed $r10, killed $r11, killed $lr + frame-setup CFI_INSTRUCTION def_cfa_offset 36 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r11, -8 + frame-setup CFI_INSTRUCTION offset $r10, -12 + frame-setup CFI_INSTRUCTION offset $r9, -16 + frame-setup CFI_INSTRUCTION offset $r8, -20 + frame-setup CFI_INSTRUCTION offset $r7, -24 + frame-setup CFI_INSTRUCTION offset $r6, -28 + frame-setup CFI_INSTRUCTION offset $r5, -32 + frame-setup CFI_INSTRUCTION offset $r4, -36 + $sp = frame-setup tSUBspi $sp, 10, 14 /* CC::al */, $noreg + frame-setup CFI_INSTRUCTION def_cfa_offset 76 + $r6, $r5 = t2LDRDi8 $r0, 8, 14 /* CC::al */, $noreg :: (load 4 from %ir.i7), (load 4 from %ir.i10) + $r8 = tMOVr killed $r3, 14 /* CC::al */, $noreg + $r3, $r7 = t2LDRDi8 killed $r0, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i), (load 4 from %ir.i5) + renamable $r0 = t2RSBri killed renamable $r6, 31, 14 /* CC::al */, $noreg, $noreg + t2STMIA $sp, 14 /* CC::al */, $noreg, killed $r0, $r2, $r8 :: (store 4 into %stack.9), (store 4 into %stack.8), (store 4 into %stack.7) + $r12 = tMOVr killed $r2, 14 /* CC::al */, $noreg + renamable $r2 = tLDRspi $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.9) + + bb.1.bb12 (align 4): + successors: %bb.2(0x40000000), %bb.5(0x40000000) + liveins: $r1, $r3, $r5, $r7, $r8, $r12, $r2 + + $r9, $r4 = t2LDRDi8 $r3, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i14), (load 4 from %ir.i20) + $r6, $r0 = t2LDRDi8 $r3, 8, 14 /* CC::al */, $noreg :: (load 4 from %ir.i22), (load 4 from %ir.i24) + t2WhileLoopStart renamable $r8, %bb.5, implicit-def dead $cpsr + tB %bb.2, 14 /* CC::al */, $noreg + + bb.2.bb27: + 
successors: %bb.3(0x80000000) + liveins: $r0, $r1, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12, $r2 + + t2STRDi8 killed $r3, killed $r5, $sp, 12, 14 /* CC::al */, $noreg :: (store 4 into %stack.6), (store 4 into %stack.5) + renamable $r3 = tLDRi renamable $r7, 0, 14 /* CC::al */, $noreg :: (load 4 from %ir.i13) + tSTRspi killed renamable $r3, $sp, 9, 14 /* CC::al */, $noreg :: (store 4 into %stack.0) + renamable $r3 = tLDRi renamable $r7, 1, 14 /* CC::al */, $noreg :: (load 4 from %ir.i34) + tSTRspi killed renamable $r3, $sp, 8, 14 /* CC::al */, $noreg :: (store 4 into %stack.1) + renamable $r3 = tLDRi renamable $r7, 2, 14 /* CC::al */, $noreg :: (load 4 from %ir.i32) + tSTRspi killed renamable $r3, $sp, 7, 14 /* CC::al */, $noreg :: (store 4 into %stack.2) + renamable $r3 = tLDRi renamable $r7, 3, 14 /* CC::al */, $noreg :: (load 4 from %ir.i30) + t2STRDi8 $r7, killed $r3, $sp, 20, 14 /* CC::al */, $noreg :: (store 4 into %stack.4), (store 4 into %stack.3) + renamable $r10 = t2LDRi12 killed renamable $r7, 16, 14 /* CC::al */, $noreg :: (load 4 from %ir.i28) + + bb.3.bb37 (align 4): + successors: %bb.3(0x7c000000), %bb.4(0x04000000) + liveins: $r0, $r1, $r2, $r4, $r6, $r8, $r9, $r10, $r12 + + $r7 = tMOVr killed $r6, 14 /* CC::al */, $noreg + renamable $r6 = tLDRspi $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %stack.1) + renamable $r3 = tLDRspi $sp, 7, 14 /* CC::al */, $noreg :: (load 4 from %stack.2) + renamable $r6, renamable $r11 = t2SMULL $r9, killed renamable $r6, 14 /* CC::al */, $noreg + renamable $r6, renamable $r11 = t2SMLAL killed renamable $r4, killed renamable $r3, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + renamable $r3 = tLDRspi $sp, 6, 14 /* CC::al */, $noreg :: (load 4 from %stack.3) + $r5 = tMOVr killed $r9, 14 /* CC::al */, $noreg + renamable $r6, renamable $r11 = t2SMLAL renamable $r7, killed renamable $r3, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + renamable $r9, renamable $r1 = t2LDR_POST 
killed renamable $r1, 4, 14 /* CC::al */, $noreg :: (load 4 from %ir.i38) + renamable $r6, renamable $r11 = t2SMLAL killed renamable $r0, renamable $r10, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + renamable $r0 = tLDRspi $sp, 9, 14 /* CC::al */, $noreg :: (load 4 from %stack.0) + $lr = tMOVr $r8, 14 /* CC::al */, $noreg + renamable $r6, renamable $r11 = t2SMLAL renamable $r9, killed renamable $r0, killed renamable $r6, killed renamable $r11, 14 /* CC::al */, $noreg + early-clobber renamable $r6, dead early-clobber renamable $r11 = MVE_ASRLr killed renamable $r6, killed renamable $r11, renamable $r2, 14 /* CC::al */, $noreg + early-clobber renamable $r12 = t2STR_POST renamable $r6, killed renamable $r12, 4, 14 /* CC::al */, $noreg :: (store 4 into %ir.i39) + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $r8 = t2SUBri killed renamable $r8, 1, 14 /* CC::al */, $noreg, $noreg + $r0 = tMOVr $r7, 14 /* CC::al */, $noreg + $r4 = tMOVr $r5, 14 /* CC::al */, $noreg + t2LoopEnd killed renamable $lr, %bb.3, implicit-def dead $cpsr + tB %bb.4, 14 /* CC::al */, $noreg + + bb.4.bb72: + successors: %bb.5(0x80000000) + liveins: $r5, $r6, $r7, $r9, $r2 + + $r0 = tMOVr killed $r7, 14 /* CC::al */, $noreg + $r7 = tADDrSPi $sp, 3, 14 /* CC::al */, $noreg + $r4 = tMOVr killed $r5, 14 /* CC::al */, $noreg + $r12, $r8 = t2LDRDi8 $sp, 4, 14 /* CC::al */, $noreg :: (load 4 from %stack.8), (load 4 from %stack.7) + tLDMIA killed $r7, 14 /* CC::al */, $noreg, def $r3, def $r5, def $r7 :: (load 4 from %stack.6), (load 4 from %stack.5), (load 4 from %stack.4) + + bb.5.bb74: + successors: %bb.6(0x04000000), %bb.1(0x7c000000) + liveins: $r0, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r12, $r2 + + renamable $r7, dead $cpsr = nuw tADDi8 killed renamable $r7, 20, 14 /* CC::al */, $noreg + t2STRDi8 killed $r9, killed $r4, $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.i14), (store 4 into %ir.i81) + t2STRDi8 killed $r6, killed $r0, $r3, 8, 14 /* CC::al */, 
$noreg :: (store 4 into %ir.i84), (store 4 into %ir.i88) + renamable $r3, dead $cpsr = nuw tADDi8 killed renamable $r3, 16, 14 /* CC::al */, $noreg + renamable $r5, $cpsr = tSUBi8 killed renamable $r5, 1, 14 /* CC::al */, $noreg + $r1 = tMOVr $r12, 14 /* CC::al */, $noreg + tBcc %bb.1, 1 /* CC::ne */, killed $cpsr + + bb.6.bb91: + $sp = frame-destroy tADDspi $sp, 10, 14 /* CC::al */, $noreg + $sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11, def $pc + +... From 0b17d4754a94b7129c2483762acd586783802b12 Mon Sep 17 00:00:00 2001 From: Jakub Lichman Date: Wed, 30 Sep 2020 07:13:59 +0000 Subject: [PATCH 7/8] [mlir][Linalg] Tile sizes for Conv ops vectorization added as pass arguments Current setup for conv op vectorization does not enable user to specify tile sizes as well as dimensions for vectorization. In this commit we change that by adding tile sizes as pass arguments. Every dimension with corresponding tile size > 1 is automatically vectorized. 
Differential Revision: https://reviews.llvm.org/D88533 --- .../Dialect/Linalg/Transforms/Transforms.h | 11 ++-- .../Dialect/Linalg/CPU/test-conv-1d-call.mlir | 4 +- .../Linalg/CPU/test-conv-1d-ncw-call.mlir | 4 +- .../Linalg/CPU/test-conv-1d-nwc-call.mlir | 4 +- .../Dialect/Linalg/CPU/test-conv-2d-call.mlir | 4 +- .../Linalg/CPU/test-conv-2d-nchw-call.mlir | 4 +- .../Linalg/CPU/test-conv-2d-nhwc-call.mlir | 4 +- .../Dialect/Linalg/CPU/test-conv-3d-call.mlir | 4 +- .../Linalg/CPU/test-conv-3d-ncdhw-call.mlir | 4 +- .../Linalg/CPU/test-conv-3d-ndhwc-call.mlir | 4 +- .../Linalg/Transforms/Vectorization.cpp | 51 +++++++++---------- .../LinalgToVector/linalg-to-vector.mlir | 2 +- .../lib/Transforms/TestConvVectorization.cpp | 13 ++++- 13 files changed, 59 insertions(+), 54 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h index b188fde5d801aa..00a094d7207674 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -32,7 +32,8 @@ struct TiledLinalgOp { /// Populates patterns for vectorization of all ConvN-D ops. void populateConvVectorizationPatterns( - MLIRContext *context, SmallVectorImpl &patterns); + MLIRContext *context, SmallVectorImpl &patterns, + ArrayRef tileSizes); /// Performs standalone tiling of a single LinalgOp by `tileSizes`. /// and permute the loop nest according to `interchangeVector` @@ -549,8 +550,8 @@ struct AffineMinSCFCanonicalizationPattern /// false of size 1. This ensures that the ConvOp can be lowered to vector /// contraction of dimensions marked in the *mask* as true. /// -/// A good example is ConvNHWCOp which is 2D Conv op with channels as the last -/// dimension. For this op we contract last 3 dimensions. +/// A good example for vectorization is ConvNHWCOp which is 2D Conv op +/// with channels as the last dimension. Let's vectorize last 3 dimensions. 
/// The initial op definition looks like this: /// ``` /// linalg.conv_2d_nhwc %arg0, %arg1, %arg2 : @@ -589,10 +590,6 @@ class ConvOpVectorization : public OpRewritePattern { LogicalResult matchAndRewrite(ConvOp minOp, PatternRewriter &rewriter) const override; - - // TODO: Make these pass arguments. - static const int tileSize = 3; - static const int noTile = 1; }; //===----------------------------------------------------------------------===// diff --git a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-call.mlir b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-call.mlir index 97ea95c8bcd1ac..7cc0875b335397 100644 --- a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-call.mlir +++ b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-call.mlir @@ -9,13 +9,13 @@ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s -// RUN: mlir-opt %s -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,3" -convert-linalg-to-llvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s // RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=4" \ -// RUN: -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: -test-conv-vectorization="tile-sizes=1,3" -convert-linalg-to-llvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s diff --git a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-ncw-call.mlir b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-ncw-call.mlir index dcfcc9b62bbc11..7f90ac675f728e 100644 --- a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-ncw-call.mlir +++ b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-ncw-call.mlir @@ -9,13 +9,13 @@ // RUN: 
-shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s -// RUN: mlir-opt %s -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,1,1,3,3" -convert-linalg-to-llvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s // RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,0,4" \ -// RUN: -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: -test-conv-vectorization="tile-sizes=1,1,1,3,3" -convert-linalg-to-llvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s diff --git a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-nwc-call.mlir b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-nwc-call.mlir index 2e79b46801bca2..3eb0959ddda164 100644 --- a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-nwc-call.mlir +++ b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-1d-nwc-call.mlir @@ -9,13 +9,13 @@ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s -// RUN: mlir-opt %s -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,1,1,3,3" -convert-linalg-to-llvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s // RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,4" \ -// RUN: -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: -test-conv-vectorization="tile-sizes=1,1,1,3,3" -convert-linalg-to-llvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s diff --git 
a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-call.mlir b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-call.mlir index e271b0a009b6f4..787cbf5d268bb2 100644 --- a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-call.mlir +++ b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-call.mlir @@ -9,13 +9,13 @@ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s -// RUN: mlir-opt %s -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,1,3,3" -convert-linalg-to-llvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s // RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,2" \ -// RUN: -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: -test-conv-vectorization="tile-sizes=1,1,3,3" -convert-linalg-to-llvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s diff --git a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-nchw-call.mlir b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-nchw-call.mlir index e27c40524fccad..c6236db6a05a2b 100644 --- a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-nchw-call.mlir +++ b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-nchw-call.mlir @@ -9,13 +9,13 @@ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s -// RUN: mlir-opt %s -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,1,1,1,3,3,3" -convert-linalg-to-llvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s // RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,0,4,4" 
\ -// RUN: -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: -test-conv-vectorization="tile-sizes=1,1,1,1,3,3,3" -convert-linalg-to-llvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s diff --git a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-nhwc-call.mlir b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-nhwc-call.mlir index b5b4a5c82c0959..3213b7dc5fe239 100644 --- a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-nhwc-call.mlir +++ b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-2d-nhwc-call.mlir @@ -9,13 +9,13 @@ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s -// RUN: mlir-opt %s -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,1,1,1,3,3,3" -convert-linalg-to-llvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s // RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,3,3,2" \ -// RUN: -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: -test-conv-vectorization="tile-sizes=1,1,1,1,3,3,3" -convert-linalg-to-llvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s diff --git a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-call.mlir b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-call.mlir index 12ea9469666032..8020f3ac017f47 100644 --- a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-call.mlir +++ b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-call.mlir @@ -9,13 +9,13 @@ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s -// RUN: mlir-opt %s -test-conv-vectorization 
-convert-linalg-to-llvm | \ +// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,1,1,3,3,3" -convert-linalg-to-llvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s // RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,2,2" \ -// RUN: -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: -test-conv-vectorization="tile-sizes=1,1,1,3,3,3" -convert-linalg-to-llvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s diff --git a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-ncdhw-call.mlir b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-ncdhw-call.mlir index e36abc83b700c5..830b5402c2a4c5 100644 --- a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-ncdhw-call.mlir +++ b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-ncdhw-call.mlir @@ -9,13 +9,13 @@ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s -// RUN: mlir-opt %s -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,1,1,1,1,3,3,3,3" -convert-linalg-to-llvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s // RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,0,5,5,5" \ -// RUN: -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: -test-conv-vectorization="tile-sizes=1,1,1,1,1,3,3,3,3" -convert-linalg-to-llvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s diff --git a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-ndhwc-call.mlir 
b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-ndhwc-call.mlir index b302b3e0d8bdf4..0b25ea09157cd8 100644 --- a/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-ndhwc-call.mlir +++ b/mlir/integration_test/Dialect/Linalg/CPU/test-conv-3d-ndhwc-call.mlir @@ -9,13 +9,13 @@ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s -// RUN: mlir-opt %s -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,1,1,1,1,3,3,3,3" -convert-linalg-to-llvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s // RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,5,5,5" \ -// RUN: -test-conv-vectorization -convert-linalg-to-llvm | \ +// RUN: -test-conv-vectorization="tile-sizes=1,1,1,1,1,3,3,3,3" -convert-linalg-to-llvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ // RUN: | FileCheck %s diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 9a225dd81c79cd..4430c34af1e9ec 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -385,16 +385,19 @@ LogicalResult ConvOpVectorization::matchAndRewrite( return failure(); SmallVector mapping; - // Fail to apply when the size of not vectorized dimension is not 1 or - // when the size of vectorized dimension is not dimSize. + SmallVector vectorDims; + // Fail to apply when the size of not vectorized dimension is not 1. 
for (unsigned i = 0; i < N; i++) { if (!mask[i] && (inShape[i] != 1 || kShape[i] != 1)) return failure(); - if (mask[i] && (inShape[i] != tileSize || kShape[i] != tileSize)) + + if (mask[i] && inShape[i] != kShape[i]) return failure(); - if (mask[i]) + if (mask[i]) { mapping.push_back(getAffineDimExpr(i, context)); + vectorDims.push_back(inShape[i]); + } } Value input = op.getInput(0); @@ -407,8 +410,7 @@ LogicalResult ConvOpVectorization::matchAndRewrite( auto map = AffineMap::get(rank, 0, mapping, context); SmallVector zeros(rank, std_constant_index(0)); - auto vecType = - VectorType::get(SmallVector(numDims, tileSize), elemType); + auto vecType = VectorType::get(vectorDims, elemType); auto inputVec = vector_transfer_read(vecType, input, zeros, map); auto kernelVec = vector_transfer_read(vecType, kernel, zeros, map); @@ -443,6 +445,9 @@ populateVectorizationPatterns(OwningRewritePatternList &tilingPatterns, OwningRewritePatternList &vectorizationPatterns, ArrayRef tileSizes, MLIRContext *context) { + if (tileSizes.size() < N) + return; + constexpr static StringRef kTiledMarker = "TILED"; constexpr static StringRef kPromotedMarker = "PROMOTED"; tilingPatterns.insert>( @@ -457,49 +462,41 @@ populateVectorizationPatterns(OwningRewritePatternList &tilingPatterns, SmallVector mask(N); int offset = tileSizes.size() - N; std::transform(tileSizes.begin() + offset, tileSizes.end(), mask.begin(), - [](int64_t i) -> bool { return i != ConvOpConst::noTile; }); + [](int64_t i) -> bool { return i > 1; }); vectorizationPatterns.insert>(context, mask); } void mlir::linalg::populateConvVectorizationPatterns( - MLIRContext *context, SmallVectorImpl &patterns) { - const int64_t tileSize = ConvOpConst::tileSize; - const int64_t noTile = ConvOpConst::noTile; - auto makeTileSizes = [&](unsigned numNoTile, unsigned numTile) { - SmallVector result(numNoTile, noTile); - result.append(numTile, tileSize); - return result; - }; - + MLIRContext *context, SmallVectorImpl &patterns, + ArrayRef 
tileSizes) { OwningRewritePatternList tiling, promotion, vectorization; - populateVectorizationPatterns( - tiling, promotion, vectorization, - makeTileSizes(/*numNoTile=*/1, /*numTile*/ 1), context); + populateVectorizationPatterns(tiling, promotion, vectorization, + tileSizes, context); populateVectorizationPatterns(tiling, promotion, vectorization, - makeTileSizes(3, 2), context); + tileSizes, context); populateVectorizationPatterns(tiling, promotion, vectorization, - makeTileSizes(3, 2), context); + tileSizes, context); populateVectorizationPatterns(tiling, promotion, vectorization, - makeTileSizes(2, 2), context); + tileSizes, context); populateVectorizationPatterns(tiling, promotion, vectorization, - makeTileSizes(4, 3), context); + tileSizes, context); populateVectorizationPatterns(tiling, promotion, vectorization, - makeTileSizes(4, 3), context); + tileSizes, context); populateVectorizationPatterns(tiling, promotion, vectorization, - makeTileSizes(3, 3), context); + tileSizes, context); populateVectorizationPatterns( - tiling, promotion, vectorization, makeTileSizes(5, 4), context); + tiling, promotion, vectorization, tileSizes, context); populateVectorizationPatterns( - tiling, promotion, vectorization, makeTileSizes(5, 4), context); + tiling, promotion, vectorization, tileSizes, context); patterns.push_back(std::move(tiling)); patterns.push_back(std::move(promotion)); diff --git a/mlir/test/Conversion/LinalgToVector/linalg-to-vector.mlir b/mlir/test/Conversion/LinalgToVector/linalg-to-vector.mlir index eeb2ca31fd2a9a..e1bb7f3caabb30 100644 --- a/mlir/test/Conversion/LinalgToVector/linalg-to-vector.mlir +++ b/mlir/test/Conversion/LinalgToVector/linalg-to-vector.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -test-conv-vectorization --cse | FileCheck %s +// RUN: mlir-opt %s -test-conv-vectorization="tile-sizes=1,3" --cse | FileCheck %s // CHECK-DAG: #[[$map0:.*]] = affine_map<(d0)[s0] -> (1, -d0 + s0)> // CHECK-DAG: #[[$map1:.*]] = affine_map<(d0)[s0] -> (d0 + 
s0)> diff --git a/mlir/test/lib/Transforms/TestConvVectorization.cpp b/mlir/test/lib/Transforms/TestConvVectorization.cpp index c90d8058de3291..79b6464f3b4cb6 100644 --- a/mlir/test/lib/Transforms/TestConvVectorization.cpp +++ b/mlir/test/lib/Transforms/TestConvVectorization.cpp @@ -24,6 +24,13 @@ namespace { /// A pass converting MLIR Linalg ops into Vector ops. class TestConvVectorization : public PassWrapper> { +public: + TestConvVectorization() = default; + TestConvVectorization(const TestConvVectorization &) {} + explicit TestConvVectorization(ArrayRef tileSizesParam) { + tileSizes = tileSizesParam; + } + void runOnOperation() override; void getDependentDialects(DialectRegistry &registry) const override { @@ -33,6 +40,10 @@ class TestConvVectorization registry.insert(); registry.insert(); } + + ListOption tileSizes{ + *this, "tile-sizes", llvm::cl::desc("Vectorization sizes."), + llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated}; }; } // namespace @@ -47,7 +58,7 @@ void TestConvVectorization::runOnOperation() { target.addLegalOp(); SmallVector stage1Patterns; - linalg::populateConvVectorizationPatterns(context, stage1Patterns); + linalg::populateConvVectorizationPatterns(context, stage1Patterns, tileSizes); OwningRewritePatternList stage2Patterns = linalg::getLinalgTilingCanonicalizationPatterns(context); From 0eab9d5823815c6520697f8d725c402c88e5d050 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 30 Sep 2020 12:39:30 +0100 Subject: [PATCH 8/8] [SCEV] Verify that all mapped SCEV AddRecs refer to valid loops. This check helps to guard against cases where expressions referring to invalidated/deleted loops are not properly invalidated. The additional check is motivated by the reproducer shared for 8fdac7cb7abb, and I think it makes sense in general as a sanity check.
Reviewed By: reames Differential Revision: https://reviews.llvm.org/D88166 --- llvm/lib/Analysis/ScalarEvolution.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 756710909ac798..8759f86e031d21 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -12005,6 +12005,25 @@ void ScalarEvolution::verify() const { std::abort(); } } + + // Collect all valid loops currently in LoopInfo. + SmallPtrSet ValidLoops; + SmallVector Worklist(LI.begin(), LI.end()); + while (!Worklist.empty()) { + Loop *L = Worklist.pop_back_val(); + if (ValidLoops.contains(L)) + continue; + ValidLoops.insert(L); + Worklist.append(L->begin(), L->end()); + } + // Check for SCEV expressions referencing invalid/deleted loops. + for (auto &KV : ValueExprMap) { + auto *AR = dyn_cast(KV.second); + if (!AR) + continue; + assert(ValidLoops.contains(AR->getLoop()) && + "AddRec references invalid loop"); + } } bool ScalarEvolution::invalidate(